Torch.not enough memory

vahid_sadeghi · February 5, 2020, 11:46am

Hello all
I tried to run a convolutional neural network on the GPU. The first time it ran and I got results, but the second time I got the error below, even though I didn’t change my code at all, which is weird.

$Torch: not enough memory: you tried to allocate 0GB. Buy new RAM!

I couldn’t figure out why this might be happening.
how can I solve this problem?
Any help will be appreciated
here is my code:

# -*- coding: utf-8 -*-

"""
Created on Thu Nov 7 13:25:00 2019

@author: Vahid
"""
from torch.autograd import Variable
import torch
import torch.cuda as cuda
import torchvision
from torchvision import transforms
import torch.nn as nn
import matplotlib.pyplot as plt

Parameters

# Hyper-parameters used by the data loaders, the model and the training loop.
batch_size = 64
n_class = 4
lr = 0.001
num_epochs = 25

# Preprocessing applied to every image: resize to 128x128, convert to a
# tensor, then normalise each of the three channels with mean 0.5 / std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

Load Custom Dataset

# Raw strings keep the Windows backslashes literal. In an ordinary string
# literal "\v" and "\t" are escape sequences (vertical tab / tab), which would
# silently corrupt these paths.
train_dataset = torchvision.datasets.ImageFolder(
    r'H:\vahid-spectrogram-class1\train',
    transform=transform,
)
valid_dataset = torchvision.datasets.ImageFolder(
    r'H:\vahid-spectrogram-class1\validation',
    transform=transform,
)
test_dataset = torchvision.datasets.ImageFolder(
    r'H:\vahid-spectrogram-class1\test',
    transform=transform,
)

# One loader per split, all using the same batch size.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
valid_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=True,
)

Convolutional neural network

class convnet(nn.Module):
    """Small CNN: three conv blocks followed by two fully connected layers.

    Every conv block is Conv2d(kernel 5, stride 1, padding 2) -> BatchNorm2d
    -> ReLU -> MaxPool2d(2, 2). The convolution keeps the spatial size and the
    pooling halves it, so a 3x128x128 input leaves ``layer3`` as 4x16x16,
    i.e. 16 * 16 * 4 = 1024 features going into ``fc1``.

    Args:
        num_classes: Number of output logits. When omitted, the module-level
            ``n_class`` is used, so ``convnet()`` keeps working as before.
    """

    def __init__(self, num_classes=None):
        # Must be the dunder ``__init__``; a method named ``init`` is never
        # invoked by ``convnet()`` and the layers would not be created.
        super(convnet, self).__init__()
        if num_classes is None:
            num_classes = n_class
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 16, 5, 1, 2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 8, 5, 1, 2),
            nn.BatchNorm2d(8),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        self.layer3 = nn.Sequential(
            nn.Conv2d(8, 4, 5, 1, 2),
            nn.BatchNorm2d(4),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        # 4 channels x 16 x 16 after three 2x poolings of a 128x128 image.
        self.fc1 = nn.Linear(16 * 16 * 4, 1000)
        self.drop_out = nn.Dropout(p=.75)
        self.fc2 = nn.Linear(1000, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for image batch ``x``."""
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        # Flatten everything except the batch dimension for the linear layers.
        out = out.reshape(out.size(0), -1)
        out = self.fc1(out)
        out = self.drop_out(out)
        out = self.fc2(out)
        return out

Model CNN

# Build the model and move it to the GPU when one is available.
convmodel = convnet()
if cuda.is_available():
    convmodel = convmodel.cuda()

# Loss
loss_fn = nn.CrossEntropyLoss()

# Optimizer
optimizer = torch.optim.Adam(convmodel.parameters(), lr=lr)

# LR schedule: multiplies the learning rate by 0.5 on every lr_sch.step().
# NOTE(review): nothing in the visible script calls lr_sch.step(), so the
# learning rate stays at `lr` for the whole run -- confirm that is intended.
lr_sch = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)

# Number of batches per epoch for each loader.
num_steps = len(train_loader)
valid_num_steps = len(valid_loader)

# Per-epoch history, filled in by the training loop and plotted afterwards.
train_loss = []
valid_loss = []
train_accuracy = []
valid_accuracy = []
# Send batches to whatever device the model's parameters live on.
device = next(convmodel.parameters()).device
# Accuracy denominators come from the datasets instead of hard-coded counts.
n_train = len(train_loader.dataset)
n_valid = len(valid_loader.dataset)

for epoch in range(num_epochs):
    # ---- training pass ----
    convmodel.train()
    iter_loss = 0.0
    correct = 0
    iterations = 0
    for imgs, lbls in train_loader:
        imgs = imgs.to(device)
        lbls = lbls.to(device)

        optimizer.zero_grad()
        out = convmodel(imgs)
        loss_tr = loss_fn(out, lbls)
        loss_tr.backward()
        optimizer.step()

        # .item() stores plain Python numbers, so no tensor (and no autograd
        # graph) is kept alive between iterations.
        iter_loss += loss_tr.item()
        predicted = out.argmax(dim=1)
        correct += (predicted == lbls).sum().item()
        iterations += 1

    # Record the mean training loss and the training accuracy (percent).
    train_loss.append(iter_loss / max(iterations, 1))
    train_accuracy.append(100.0 * correct / max(n_train, 1))

    # ---- validation pass ----
    convmodel.eval()
    loss = 0.0
    correct = 0
    iterations = 0
    # no_grad() stops autograd from recording the forward pass. Accumulating
    # the raw loss tensor here (as before) kept every batch's graph alive and
    # made memory use grow throughout validation.
    with torch.no_grad():
        for items, classes in valid_loader:
            items = items.to(device)
            classes = classes.to(device)
            out = convmodel(items)
            loss += loss_fn(out, classes).item()
            predicted = out.argmax(dim=1)
            correct += (predicted == classes).sum().item()
            iterations += 1

    # Record the mean validation loss and the validation accuracy (percent,
    # same scale as the training accuracy).
    valid_loss.append(loss / max(iterations, 1))
    valid_accuracy.append(100.0 * correct / max(n_valid, 1))

    print ('Epoch %d/%d, Tr Loss: %.4f, Tr Acc: %.4f, Val Loss: %.4f, Val Acc: %.4f'
           %(epoch+1, num_epochs, train_loss[-1], train_accuracy[-1],
             valid_loss[-1], valid_accuracy[-1]))

# Evaluate on the test split and print the running accuracy after each batch.
convmodel.eval()
# Inputs must be on the same device as the model; the loader yields CPU tensors.
device = next(convmodel.parameters()).device
corrects = 0
seen = 0
num_steps = len(test_loader)
with torch.no_grad():  # inference only: do not record an autograd graph
    for j, (imgs, lbls) in enumerate(test_loader):
        imgs = imgs.to(device)
        lbls = lbls.to(device)
        out = convmodel(imgs)
        predicted = torch.argmax(out, 1)
        corrects += (predicted == lbls).sum().item()
        # Count the samples actually seen; the final batch can be smaller
        # than batch_size.
        seen += lbls.size(0)
        print('Step [{}/{}] Acc {:.4f}: '.format(j+1, num_steps, 100.*corrects/seen))

# Loss curves: one point per epoch for training and validation.
f = plt.figure(figsize=(10, 8))
plt.plot(train_loss, label='training loss')
plt.plot(valid_loss, label='validation loss')
plt.legend()
plt.show()

# In[13]:

# Accuracy curves: one point per epoch for training and validation.
f = plt.figure(figsize=(10, 8))
plt.plot(train_accuracy, label='training accuracy')
plt.plot(valid_accuracy, label='validation accuracy')
plt.legend()
plt.show()

albanD · February 5, 2020, 1:28pm

This error means that you are running out of RAM. Do you have other people using the same machine at the same time? Can you check if other processes are using RAM on your machine?

vahid_sadeghi · February 5, 2020, 4:01pm

Thanks for your answer
The first time I ran my code I got good results, but the second time I got the CUDA error: not enough memory.
Only my code is running on the system.
Would you help me solve the problem?
I have been stuck on this for a couple of days.

albanD · February 5, 2020, 7:08pm

A CUDA error? The error you mentioned above, “Torch: not enough memory: you tried to allocate 0GB. Buy new RAM!”, is about CPU memory only.
Or do you see another error message?