Loss.backward() cuda runtime error (2) : out of memory at /pytorch/torch/lib/THC/generic/THCStorage.cu:66

I am using a pre-trained model (ResNet-18) to identify dog breeds. Link to the dataset: Dog-Breed

This is my code:

from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import numpy as np
from torchvision import datasets, models, transforms
import time
import os
import copy
from collections import Counter
import errno
import pandas
from sklearn.model_selection import StratifiedShuffleSplit
import cv2

LABELS_LOCATION = './dataset/labels.csv'
TRAIN_LOCATION = './dataset/train/'
TEST_LOCATION = './dataset/test/'
ROOT_PATH = './dataset/'
use_gpu = torch.cuda.is_available()


# Read CSV
def read_csv(csvf):
    # print(pandas.read_csv(csvf).values)
    data = pandas.read_csv(csvf).values
    labels_dict = dict(data)
    idz = list(labels_dict.keys())
    clazz = list(labels_dict.values())
    return labels_dict, idz, clazz


def create_dir(path_name):
    try:
        os.makedirs(path_name)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise


def class_hashmap(class_arr):
    uniq_clazz = Counter(class_arr)
    class_dict = {}
    for i, j in enumerate(uniq_clazz):
        class_dict[j] = i
    return class_dict


labels, ids, class_names = read_csv(LABELS_LOCATION)
train_images = os.listdir(TRAIN_LOCATION)
class_numbers = class_hashmap(class_names)

data_transforms = {
    'train': transforms.Scale(60),
}

images_len = len(train_images)

'''
Resizing the images to 224x224 so that they all have the same dimensionality when fed to the model
'''
resize = []
indexed_labels = []
for t_i in train_images:
    # resize.append(transform.resize(io.imread(TRAIN_LOCATION+t_i), (60, 60, 3)))  # (60, 60) is the height and width; 3 is the number of channels
    # transpose HxWxC -> CxHxW; a plain reshape would scramble the channel layout
    resize.append(cv2.resize(cv2.imread(TRAIN_LOCATION + t_i), (224, 224)).transpose(2, 0, 1))
    indexed_labels.append(class_numbers[labels[t_i.split('.')[0]]])

resize = np.asarray(resize)
print(resize.shape)
'''
Splitting the data into 7:1:2 (train:val:test) using stratification
'''
X = resize  # numpy array of images [training data]
y = np.array(indexed_labels)  # indexed labels for images [training labels]

sss = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=0)
sss.get_n_splits(X, y)


for train_index, test_index in sss.split(X, y):
    X_temp, X_test = X[train_index], X[test_index]  # split train into train and test [data]
    y_temp, y_test = y[train_index], y[test_index]  # labels

sss = StratifiedShuffleSplit(n_splits=3, test_size=0.123, random_state=0)
sss.get_n_splits(X_temp, y_temp)

for train_index, test_index in sss.split(X_temp, y_temp):
    print("TRAIN:", train_index, "VAL:", test_index)
    X_train, X_val = X_temp[train_index], X_temp[test_index]  # training and validation data (indices refer to X_temp)
    y_train, y_val = y_temp[train_index], y_temp[test_index]  # training and validation labels

# print(type(X_train),type(X_test),type(X_val))
# print(type(y_train),type(y_test),type(y_val))


batch_size = 500
learning_rate = 0.001

'''
Data Loaders
'''

train = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=False)

val = torch.utils.data.TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))
val_loader = torch.utils.data.DataLoader(val, batch_size=batch_size, shuffle=False)

test = torch.utils.data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

# print(train_loader.size)

dataloaders = {
    'train': train_loader,
    'val': val_loader
}
print(dataloaders['train'])


def train_model(model, fc, criterion, optimizer, scheduler, num_epochs=25):
    """
    :param model: backbone network
    :param fc: final fully connected layer applied to the backbone output
    :param criterion: loss function
    :param optimizer: optimizer for the trainable parameters
    :param scheduler: learning rate scheduler
    :param num_epochs: number of training epochs
    :return: the model loaded with the best validation weights
    """

    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            print(".......phase:", phase)
            if phase == 'train':
                scheduler.step()
                model.train(True)  # Set model to training mode
            else:
                model.train(False)  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for data in dataloaders[phase]:
                print(".......... inside second for dataloaders")
                # get the inputs
                inputs, i_labels = data
                print("........ input size", len(inputs))
                print("......... labels size", len(i_labels))
                # wrap them in Variable
                if use_gpu:
                    inputs = Variable(inputs.cuda()).float()
                    i_labels = Variable(i_labels.cuda()).float().type(torch.cuda.LongTensor)
                else:
                    inputs, i_labels = Variable(inputs).float(), Variable(i_labels).float().type(torch.LongTensor)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                outputs = model(inputs)
                outputs = outputs.view(outputs.size(0), -1)
                outputs = fc(outputs)
                _, preds = torch.max(outputs.data, 1)
                print("........ output size", outputs.size)
                loss = criterionn(outputs, i_labels)
                print("....... loss - ", loss)
                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                # statistics
                running_loss += loss.data[0]
                running_corrects += torch.sum(preds == i_labels.data)

            epoch_loss = running_loss / len(dataloaders[phase])
            epoch_acc = running_corrects / len(dataloaders[phase].dataset)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model



'''
Any pre-trained model can be plugged in here; ResNet-18 is used below
'''
model_ft = models.resnet18(pretrained=True)

# freeze all model parameters
# for param in model_ft.parameters():
#     param.requires_grad = False

# new final layer with one output per class (adapted from a Kaggle kernel)
print("...... total unique classes", len(class_numbers))
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, len(class_numbers))

print(".......model_ft_success.......")
if use_gpu:
    model_ft = model_ft.cuda()
    model_ft.fc = model_ft.fc.cuda()

if use_gpu:
    criterion = nn.CrossEntropyLoss().cuda()
else:
    criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.fc.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
print(".....exp_lr_scheduler.......")
model_ft = train_model(model_ft, model_ft.fc, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=25)

I am getting this error:

THCudaCheck FAIL file=/pytorch/torch/lib/THC/generic/THCStorage.cu line=66 error=2 : out of memory
Traceback (most recent call last):
  File "TL.py", line 247, in <module>
    num_epochs=25)
  File "TL.py", line 185, in train_model
    loss.backward()
  File "/home/venvs/pytorch/local/lib/python2.7/site-packages/torch/autograd/variable.py", line 156, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, retain_variables)
  File "/home/venvs/pytorch/local/lib/python2.7/site-packages/torch/autograd/__init__.py", line 98, in backward
    variables, grad_variables, retain_graph)
RuntimeError: cuda runtime error (2) : out of memory at /pytorch/torch/lib/THC/generic/THCStorage.cu:66

What is it that I am missing here?

It looks like your GPU memory is not large enough. Maybe reduce batch size?
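
For example, a minimal sketch of that change against the script above (200 is just an example; pick the largest value that fits in your GPU memory):

batch_size = 200  # was 500; lower it until the forward + backward pass fits on the GPU
learning_rate = 0.001

train = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=False)

Only the number of images moved to the GPU per iteration changes; the rest of the training loop stays the same.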

Ah, yeah, I reduced it to 200. Now it works fine.

But even with my batch_size set to 8 I still get this error:
THCudaCheck FAIL file=d:\downloads\pytorch-master-1\torch\lib\thc\generic/THCStorage.cu line=66 error=2 : out of memory
Traceback (most recent call last):
  File "F:/yzy/03Pytorch_classify/train_test.py", line 80, in <module>
    loss.backward()
  File "D:\ProgramData\Anaconda3\lib\site-packages\torch\autograd\variable.py", line 144, in backward
    self._execution_engine.run_backward((self,), (gradient,), retain_variables)
  File "D:\ProgramData\Anaconda3\lib\site-packages\torch\autograd\function.py", line 90, in apply
    return self._forward_cls.backward(self, *args)
  File "D:\ProgramData\Anaconda3\lib\site-packages\torch\nn\_functions\linear.py", line 27, in backward
    grad_weight = torch.mm(grad_output.t(), input)
  File "D:\ProgramData\Anaconda3\lib\site-packages\torch\autograd\variable.py", line 523, in mm
    output = Variable(self.data.new(self.data.size(0), matrix.data.size(1)))
RuntimeError: cuda runtime error (2) : out of memory at d:\downloads\pytorch-master-1\torch\lib\thc\generic/THCStorage.cu:66

My code is:
# hyperparameters
num_epochs = 2
batch_size = 8
learning_rate = 0.001
cnn = CNN()
print(cnn)
cnn.cuda()
if __name__ == '__main__':
    # loss and optimizer
    loss_fun = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(cnn.parameters(), lr=learning_rate)

    # train the model
    cnn.train()
    for epoch in range(num_epochs):
        train_loss = 0
        train_acc = 0
        for i, (images, labels) in enumerate(train_loader):
            images = Variable(images).cuda()
            # print(labels)
            labels = Variable(labels).cuda()
            # print(labels)
            # forward pass, backward pass, optimize
            optimizer.zero_grad()
            output = cnn(images)
            loss = loss_fun(output, labels)
            loss.backward()
            optimizer.step()

It is possible that batch size 8 is still too large for your model and environment.
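
If shrinking the batch further would hurt training, one common workaround is gradient accumulation: run several small micro-batches and call optimizer.step() only once per group. A minimal sketch, assuming the same cnn, loss_fun, optimizer and train_loader from the code above (accum_steps is a name introduced here; the old Variable API is kept to match this thread):

accum_steps = 4  # effective batch size = batch_size * accum_steps

optimizer.zero_grad()
for i, (images, labels) in enumerate(train_loader):
    images = Variable(images).cuda()
    labels = Variable(labels).cuda()

    output = cnn(images)
    loss = loss_fun(output, labels) / accum_steps  # scale so the accumulated gradient is an average
    loss.backward()                                # the graph for this micro-batch is freed here

    if (i + 1) % accum_steps == 0:
        optimizer.step()        # update weights once every accum_steps micro-batches
        optimizer.zero_grad()

Peak memory is driven by the micro-batch size, because each loss.backward() releases its graph before the next forward pass.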

In a multi-task model, even if I set the batch size to 2, it still reports an out of memory error…