RuntimeError: CUDA out of memory. GPU Memory usage keeps on increasing

I am repeatedly getting the following error:

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 10.91 GiB total capacity; 10.33 GiB already allocated; 10.75 MiB free; 4.68 MiB cached)

The GPU memory usage keeps increasing, and the program hits this error just after the first 3 epochs.

I have spent numerous hours trying out the various methods suggested on multiple forums, but nothing has worked so far. It would be really great if anyone could help me.

The code is:

import os
import sys

import numpy as np
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
from OWMLayer import OWMLayer
import gc
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # ignore warning
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # use gpu
use_cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if use_cuda else 'cpu')

# Seed

seed_num = 30
np.random.seed(seed_num)
torch.manual_seed(seed_num)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed_num)
else:
    print('[CUDA unavailable]')
    sys.exit()

# Hyper Parameters

class_num = 10 # mnist
num_epochs = 50
batch_size = 100
learning_rate = 2.0
dtype = torch.cuda.FloatTensor # run on GPU

# MNIST Dataset

train_dataset = dsets.MNIST(root='./data/', train=True, transform=transforms.ToTensor(), download=True)

test_dataset = dsets.MNIST(root='./data/', train=False, transform=transforms.ToTensor())

# Data Loader (Input Pipeline)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

def get_weight(shape, zeros=None):
    np.random.seed(seed_num)
    if zeros is None:
        w = np.random.normal(0, 1.0, shape)
        w = torch.from_numpy(w/(np.sqrt(sum(shape)/2.0)))
    else:
        w = np.zeros(shape)
        w = torch.from_numpy(w)
    return Variable(w.type(dtype), requires_grad=True)

def get_bias(shape):
    bias = 0.01 * np.random.rand(shape)
    bias = torch.from_numpy(bias)
    return Variable(bias.type(dtype), requires_grad=True)

def get_layer(shape, alpha=0, zeros=None):
    """
    :type alpha: learningrate
    """
    w = get_weight(shape, zeros)
    return w, OWMLayer(shape, alpha)

alpha = 1.0

# Layer1

w1, force_layer1 = get_layer([28*28, 800], alpha=alpha)
b1 = get_bias(w1.size(1))

# Layer2

w2, force_layer2 = get_layer([800, 800], alpha=alpha)
b2 = get_bias(w2.size(1))

# Layer_out

wo, force_layer_out = get_layer([800, class_num], alpha=alpha)
myAFun = nn.ReLU().cuda()
myDrop = nn.Dropout(p=0.2).cuda()
criterion = nn.CrossEntropyLoss().cuda()
n = 0
lambda_loss = 1e-3
Task_num = 3

for task_index in range(Task_num):
    ss = np.arange(28*28)
    if task_index > 0:
        np.random.seed(task_index)
        np.random.shuffle(ss)
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            labels = Variable(labels).cuda()
            images = Variable(images).cuda()
            images = images.view(-1, 28 * 28)
            numpy_data = images.data.cpu().numpy()
            input = torch.from_numpy(numpy_data[:, ss])
            input = Variable(input.type(dtype))
            # Forward + Backward + Optimize
            output1 = myDrop(myAFun(input.mm(w1) + b1))

            output2 = myDrop(myAFun(output1.mm(w2) + b2))

            y_pred = output2.mm(wo)
            loss = criterion(y_pred, labels)+lambda_loss*(torch.norm(w1)+torch.norm(wo)+torch.norm(w2))
            loss.backward()

            force_layer1.force_learn(w1, input, learning_rate)
            force_layer2.force_learn(w2, output1, learning_rate)
            force_layer_out.force_learn(wo, output2, learning_rate)

            n = torch.norm(wo).data.detach().item()
            if ((i + 1) % (len(train_dataset) // batch_size)) == 0:
                print('Task [{:d}/{:d}]: Epoch [{:d}/{:d}], Iter [{:d}/{:d}] Loss: {:.3f} Norm: {:.3f}'
                      .format(task_index + 1, Task_num, epoch + 1, num_epochs, i + 1, len(train_dataset) // batch_size,
                              loss.data.detach().item(), n))

# Test the Model

correct_all = []
for task_index in range(Task_num):
    ss = np.arange(28 * 28)
    if task_index > 0:
        np.random.seed(task_index)
        np.random.shuffle(ss)
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = Variable(images).cuda()
        images = images.view(-1, 28 * 28)
        numpy_data = images.data.cuda().numpy()
        input = torch.from_numpy(numpy_data[:, ss])
        input = Variable(input.type(dtype))
        # Forward
        output1 = myAFun(input.mm(w1) + b1)

        output2 = myAFun(output1.mm(w2) + b2)

        y_pred = output2.mm(wo)

        _, predicted = torch.max(y_pred.data, 1)
        total += labels.size(0).item()
        correct += (predicted.cpu() == labels).sum()
    correct_all.append((100 * correct / total))
    print('Test Accuracy of the model on the 10000 Shuffled_mnist images: %0.2f %%' % (100 * correct / total))

print("Average Test Accuracy on All Tasks: {0:.2f} %".format(sum(correct_all) / len(correct_all)))

Did you try

nvidia-smi
sudo kill -9 PID_NUMBER

to end processes that might still be running in the background?

If you did, you can try lowering the batch size during training, e.g. to 16 or 32.
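For example, something like this (reusing the train_dataset and test_dataset objects from your code) would rebuild the loaders with a smaller batch size:

```python
# Rebuild the loaders with a smaller batch size so each forward/backward
# pass allocates fewer activation tensors on the GPU.
batch_size = 32
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
```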

I have tried all of that, but my GPU memory usage just keeps on increasing.

Your current code is a bit hard to read, so please format it by wrapping it into three backticks ```.
Also, remove the usage of Variable, as it's been deprecated since PyTorch 0.4, as well as the usage of the .data attribute, as it might yield unwanted side effects.
In the test_loader loop it seems you are not wrapping the code in a with torch.no_grad() block, so you might want to add it.
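For example (just a rough sketch using the w1/b1/w2/b2/wo tensors and loaders from your post, not a guaranteed fix for the growing memory usage), the evaluation loop could look like this:

```python
# Evaluation without Variable or .data, wrapped in torch.no_grad() so that
# no computation graph (and no extra GPU memory) is kept around.
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device).view(-1, 28 * 28)
        output1 = myAFun(images.mm(w1) + b1)
        output2 = myAFun(output1.mm(w2) + b2)
        y_pred = output2.mm(wo)
        _, predicted = torch.max(y_pred, 1)
        total += labels.size(0)
        correct += (predicted.cpu() == labels).sum().item()
print('Test Accuracy: %.2f %%' % (100.0 * correct / total))
```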

Code for OWMLayer.py

# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable
dtype = torch.FloatTensor  # run on GPU


class OWMLayer:

    def __init__(self,  shape, alpha=0):

        self.input_size = shape[0]
        self.output_size = shape[1]
        self.alpha = alpha
        self.P = Variable((1.0/self.alpha)*torch.eye(self.input_size).type(dtype))
        self.P = self.P.cuda()
    def force_learn(self, w, input_, learning_rate, alpha=1.0):  # input_(batch,input_size)
        self.r = torch.mean(input_, 0, True)
        # print(self.P.device,self.r.device)
        self.k = torch.mm(self.P, torch.t(self.r))
        self.c = 1.0 / (alpha + torch.mm(self.r, self.k))  # 1X1
        self.P.sub_(self.c*torch.mm(self.k, torch.t(self.k)))
        # print(w.grad)
        w.data -= learning_rate * torch.mm(self.P.data, w.grad.data)
        w.grad.data.zero_()

    def predit_lable(self, input_, w,):
        return torch.mm(input_, w)

Code for training the model:

import os
import sys
import time
import numpy as np
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
# from torch.autograd import  
from OWMLayer import OWMLayer
import gc
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # ignore warning
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # use gpu
use_cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if use_cuda else 'cpu')

# Seed
seed_num = 30
np.random.seed(seed_num)
torch.manual_seed(seed_num)
if torch.cuda.is_available():
	torch.cuda.manual_seed(seed_num)
else:
	print('[CUDA unavailable]')
	sys.exit()
# Hyper Parameters
class_num = 10  # mnist
num_epochs = 50
batch_size = 100
learning_rate = 2.0
dtype = torch.FloatTensor  # run on GPU
# MNIST Dataset
train_dataset = dsets.MNIST(root='./data/', train=True, transform=transforms.ToTensor(), download=True)

test_dataset = dsets.MNIST(root='./data/', train=False, transform=transforms.ToTensor())

# Data Loader (Input Pipeline)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


def get_weight(shape, zeros=None):
	np.random.seed(seed_num)
	if zeros is None:
		w = np.random.normal(0, 1.0, shape)
		w = torch.from_numpy(w/(np.sqrt(sum(shape)/2.0)))
	else:
		w = np.zeros(shape)
		w = torch.from_numpy(w)
	return  (w.type(dtype).requires_grad_())


def get_bias(shape):
	bias = 0.01 * np.random.rand(shape)
	bias = torch.from_numpy(bias)
	return  (bias.type(dtype).requires_grad_())


def get_layer(shape, alpha=0, zeros=None):
	"""
	:type alpha: learningrate
	"""
	w = get_weight(shape, zeros)
	return w, OWMLayer(shape, alpha)


alpha = 1.0
# Layer1
w1, force_layer1 = get_layer([28*28, 800], alpha=alpha)
w1 = w1.cuda()
b1 = get_bias(w1.size(1))
b1 = b1.cuda()
# Layer2
w2, force_layer2 = get_layer([800, 800], alpha=alpha)
w2 = w2.cuda()
b2 = get_bias(w2.size(1))
b2 = b2.cuda()
# Layer_out
wo, force_layer_out = get_layer([800, class_num], alpha=alpha)
wo = wo.cuda()
myAFun = nn.ReLU()
myDrop = nn.Dropout(p=0.2)
criterion = nn.CrossEntropyLoss()
n = 0
lambda_loss = 1e-3
Task_num = 3


for task_index in range(Task_num):
	ss = np.arange(28*28)
	if task_index > 0:
		np.random.seed(task_index)
		np.random.shuffle(ss)
	for epoch in range(num_epochs):
		time0 = time.time()
		for i, (images, labels) in enumerate(train_loader):
			labels =  (labels).cuda()
			images =  (images).cuda()
			images = images.view(-1, 28 * 28)
			# numpy_data = images.data.cpu().numpy()
			# input = torch.from_numpy(numpy_data[:, ss])
			# input =  (input.type(dtype))
			# Forward + Backward + Optimize

			output1 = myDrop(myAFun(images.mm(w1) + b1)).cuda()

			output2 = myDrop(myAFun(output1.mm(w2) + b2))

			y_pred = output2.mm(wo)
			loss = criterion(y_pred, labels)+lambda_loss*(torch.norm(w1)+torch.norm(wo)+torch.norm(w2))
			wo.retain_grad()
			w1.retain_grad()
			w2.retain_grad()
			loss.backward()
			force_layer1.force_learn(w1, images, learning_rate)
			force_layer2.force_learn(w2, output1, learning_rate)
			force_layer_out.force_learn(wo, output2, learning_rate)

			n = torch.norm(wo).data.item()
			if ((i + 1) % (len(train_dataset) // batch_size)) == 0:
				print('Time',time.time() - time0)
				print('Task [{:d}/{:d}]: Epoch [{:d}/{:d}], Iter [{:d}/{:d}] Loss: {:.3f} Norm: {:.3f}'
					  .format(task_index + 1, Task_num, epoch + 1, num_epochs, i + 1, len(train_dataset) // batch_size,
							  loss.data.item(), n))
			# time.sleep(20)
			# gc.collect()
			#del labels,images,y_pred,numpy_data
			#torch.cuda.empty_cache()


# Test the Model
correct_all = []
# with torch.no_grad():
for task_index in range(Task_num):
	ss = np.arange(28 * 28)
	if task_index > 0:
		np.random.seed(task_index)
		np.random.shuffle(ss)
	correct = 0
	total = 0
	for images, labels in test_loader:
		# images =  (images).cuda()
		images = images.view(-1, 28 * 28)
		numpy_data = images.data.cuda().numpy()
		input = torch.from_numpy(numpy_data[:, ss])
		input =  (input.type(dtype))
		# Forward
		output1 = myAFun(input.mm(w1) + b1)

		output2 = myAFun(output1.mm(w2) + b2)

		y_pred = output2.mm(wo)

		_, predicted = torch.max(y_pred.data, 1)
		total += labels.size(0).item()
		correct += (predicted.cpu() == labels).sum()
	correct_all.append((100 * correct / total))
	print('Test Accuracy of the model on the 10000 Shuffled_mnist images: %0.2f %%' % (100 * correct / total))


print("Average Test Accuracy on All Tasks: {0:.2f} %".format(sum(correct_all) / len(correct_all)))



@ptrblck @juanko It would be really great if you could have a look at the newly posted, formatted code above and let me know what changes I would have to make to avoid the GPU memory leak.

One problem could be that the output variable is not freed after every pass of the model. Its value and the corresponding activations are still present when the second epoch starts.

You can do training and validation inside a function, so that their variables go out of scope after the call and the memory is freed. Look at "Don't hold onto tensors" in the Frequently Asked Questions — PyTorch 1.9.0 documentation.

And yes, as @ptrblck mentioned, you should put validation under the torch.no_grad() context so that gradients are not computed. You don't need them.
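For illustration, here is a minimal, self-contained sketch of that pattern. It uses a plain nn.Linear model standing in for your OWM layers and reuses num_epochs and the loaders from your script, just to show the function scoping, the .item() usage, and torch.no_grad(); it is not your OWM training itself:

```python
import torch
import torch.nn as nn

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = nn.Sequential(nn.Linear(28 * 28, 800), nn.ReLU(), nn.Linear(800, 10)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

def train_one_epoch(loader):
    # Everything created here (activations, loss, the autograd graph) goes out
    # of scope when the function returns, so PyTorch can free that memory.
    running_loss = 0.0
    for images, labels in loader:
        images = images.to(device).view(images.size(0), -1)
        labels = labels.to(device)
        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()  # keep a Python float, not the loss tensor
    return running_loss / len(loader)

def evaluate(loader):
    correct, total = 0, 0
    with torch.no_grad():  # no graph is built, so no gradients are stored
        for images, labels in loader:
            images = images.to(device).view(images.size(0), -1)
            predicted = model(images).argmax(dim=1).cpu()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    return 100.0 * correct / total

for epoch in range(num_epochs):
    print('Epoch {}: loss {:.3f}, acc {:.2f} %'.format(
        epoch + 1, train_one_epoch(train_loader), evaluate(test_loader)))
```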