CUDA out of memory unexpectedly - GPU error or code?

I am using an Nvidia GTX 1650 with 4 GB of memory. I was mainly implementing this code (I just added the test set manually).

My code was:


import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import seaborn as sn
import pandas as pd
import torchnet.meter.confusionmeter as cm

# Data augmentation and normalization for training
# Just normalization for validation & test
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'test': transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
}

data_dir = 'output'
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val', 'test']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=16,
                                             shuffle=True, num_workers=4)
              for x in ['train', 'val', 'test']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val', 'test']}
class_names = image_datasets['train'].classes

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#lists for graph generation
epoch_counter_train = []
epoch_counter_val = []
train_loss = []
val_loss = []
train_acc = []
val_acc = []

def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                    

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()
            
            

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

# Using a model pre-trained on ImageNet and replacing its final linear layer

# #For resnet18
# model_ft = models.resnet18(pretrained=False)
# num_ftrs = model_ft.fc.in_features
# model_ft.fc = nn.Linear(num_ftrs, len(class_names))


# For VGG16_BN: swap the final classifier layer for one with len(class_names) outputs
model_ft = models.vgg16_bn(pretrained=True)
num_ftrs = model_ft.classifier[6].in_features
model_ft.classifier[6] = nn.Linear(num_ftrs, len(class_names))

model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=35)

Output:

Epoch 0/34
----------
Traceback (most recent call last):

  File "<ipython-input-13-7ec6f3e86735>", line 2, in <module>
    num_epochs=35)

  File "<ipython-input-11-3713f936f313>", line 101, in train_model
    outputs = model(inputs)

  File "C:\Users\frmas\anaconda3\envs\tfgpu\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)

  File "C:\Users\frmas\anaconda3\envs\tfgpu\lib\site-packages\torchvision\models\vgg.py", line 43, in forward
    x = self.features(x)

  File "C:\Users\frmas\anaconda3\envs\tfgpu\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)

  File "C:\Users\frmas\anaconda3\envs\tfgpu\lib\site-packages\torch\nn\modules\container.py", line 117, in forward
    input = module(input)

  File "C:\Users\frmas\anaconda3\envs\tfgpu\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)

  File "C:\Users\frmas\anaconda3\envs\tfgpu\lib\site-packages\torch\nn\modules\conv.py", line 419, in forward
    return self._conv_forward(input, self.weight)

  File "C:\Users\frmas\anaconda3\envs\tfgpu\lib\site-packages\torch\nn\modules\conv.py", line 416, in _conv_forward
    self.padding, self.dilation, self.groups)

RuntimeError: CUDA out of memory. Tried to allocate 196.00 MiB (GPU 0; 4.00 GiB total capacity; 2.45 GiB already allocated; 85.14 MiB free; 2.47 GiB reserved in total by PyTorch)

The main issue is that I ran this code a few days ago and it completed successfully. I also checked the GPU usage while it was running, and it was taking around 1600 MB. But today, when I run train_model, the memory usage jumps from 1 GB to 3.5 GB.

I tried lowering the batch size, with no luck. The previous day I trained twice, with batch sizes of 16 and 64, and it worked in both cases.

My dataset is 753 MB in total.
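As a sanity check, the standard torch.cuda memory queries can show whether the memory is actually held by PyTorch's caching allocator in this process or by something else entirely (a minimal diagnostic sketch, not part of the original script):

import torch

# What the current PyTorch process is holding on GPU 0:
# memory_allocated = memory used by live tensors,
# memory_reserved  = memory cached by PyTorch's allocator.
print('allocated: {:.0f} MiB'.format(torch.cuda.memory_allocated(0) / 1024 ** 2))
print('reserved:  {:.0f} MiB'.format(torch.cuda.memory_reserved(0) / 1024 ** 2))

# If nvidia-smi reports ~3.4 GiB in use while both numbers here are small,
# the memory is held outside this process (e.g. a leftover Python interpreter).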

These are the details of my Nvidia GPU (nvidia-smi output):

Wed Sep 16 17:52:21 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 451.83       Driver Version: 451.83       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX 1650   WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   56C    P0    15W /  N/A |   3417MiB /  4096MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A     16100      C   ...da3\envs\tfgpu\python.exe    N/A      |
+-----------------------------------------------------------------------------+

Are you running a notebook or a script? When a script stops running, the GPU releases its memory, so it may not show up in nvidia-smi.

I am using Spyder, and yes, it is a script.
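If the script is re-run inside the same Spyder/IPython console, the variables from the previous run (model_ft, optimizer_ft, the dataloaders) stay alive, so the old model can still be occupying the GPU when the new run starts. A rough sketch of how to release that memory without restarting the console, assuming the variable names from the script above still exist in the console namespace:

import gc
import torch

# Drop the references left over from the previous run, then hand the
# cached blocks back to the driver.
del model_ft, optimizer_ft, exp_lr_scheduler
gc.collect()
torch.cuda.empty_cache()

Restarting the console before retraining (or running the script in a dedicated console so it exits when training finishes) frees the memory just as well and is usually the simpler option.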