RuntimeError: cuda runtime error (2) : out of memory at /pytorch/aten/src/THC/generic/THCStorage.cu:58

When I trained my network, I got this error. My network is modified from VGG16 and I use batchsize 1. During the first epoch, after almost 2900 steps, the error throw. I think my network and code is correct . I cannot figure out where are wrong. Could anybody help me?

Yes, it also occur some time to me, what I try to fix is type nvidia-smi in ternmial, and kill the process of python program and then again open it and run it, it would solve the problem

but there is almost no other program except this program is running in my computer and the memory for my card is enough, I wonder if there are some varialbles or something need to be cleared but I do not do that, but I cannot find out what is it

Kill this program, and then reopen it, hope it would help you


problem is at loss.backward(), but I cannot figure out what’s wrong with my code.

Since this error does only occur after several iterations, it seems, that you are unintentionally keeping memory allocated.

This is often the case while logging/calculating some metrics. Could you post your code, so we can have a look at it?

my train code is as follows:

def train_model(model, criterion, optimizer, scheduler, num_epochs=30):
since = time.time()

best_model = model
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
preheight = 0
prewidth = 0



for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch, num_epochs - 1))
    print('-' * 20)

    # Each epoch has a training and validation phase
    for phase in ['train', 'val']:
        if phase == 'train':
            #print ("phase is ", phase)
            scheduler.step()
            model.train()
            
        else:
            model.eval()
            

        running_loss = 0.0
        running_corrects = 0

        batchsize = 1
        labels_tensor = torch.zeros(batchsize)
        batchcounter = 0
        counter=0


        # Iterate over data.
        txn = icdarTrainDataset.begin()
        cursor = txn.cursor()

        length = txn.stat()['entries']
        
       
        for key, value in cursor:     
            raw_datum = txn.get(key)                

            datum = caffe.proto.caffe_pb2.Datum() 
            datum.ParseFromString(raw_datum)
            flat_x = np.fromstring(datum.data, dtype=np.uint8) 
                           
            inputs = flat_x.reshape(datum.channels, datum.height, datum.width)
          
            if datum.height>datum.width:
               datum.width = 336
               ratio = datum.height/math.floor(datum.width)
               datum.height = int(ratio*336)
            else:
               datum.height = 336
               ratio = datum.height/math.floor(datum.width)
               datum.width = int(336/ratio)

            if datum.height%16 != 0:
               datum.height = int(16* math.floor(datum.height/16))

            
            if datum.width%16 != 0:
               datum.width = 16*int(datum.width/16)

            
            c,h,w = inputs.shape
            
            inputs = np.transpose(inputs,(1,2,0))            
            inputs = cv2.resize(inputs,(datum.width,datum.height))
           


            h,w,c = inputs.shape                
            inputs = np.transpose(inputs,(2,0,1))
           

            
            inputs = torch.Tensor(inputs)
            labels = datum.label

  
                                                      
            labels_tensor[batchcounter] = labels
            
            if batchcounter == 0: 
               inputs_tensor = torch.zeros(batchsize, datum.channels, datum.height, datum.width)  
            

            inputs = inputs.view(batchsize,datum.channels, datum.height, datum.width)    
            inputs_tensor[batchcounter,:] = inputs
            batchcounter+=1
            
            if(batchcounter == batchsize):
               optimizer.zero_grad()
               batchcounter = 0

               with torch.set_grad_enabled(phase == 'train'):
                    finetuned_params = list(map(id, model.conv5_4n.parameters()))
                    finetuned_params1 = list(map(id, model.ChannelWise_attentionn.parameters()))
                    finetuned_params2 = list(map(id, model.attn1.parameters()))
                    finetuned_params3 = list(map(id, model.attn2.parameters()))
            #print ("finetuned params: ", finetuned_params)

            base_params = filter(lambda p: id(p) not in finetuned_params+finetuned_params1+finetuned_params2+finetuned_params3, model.parameters())
                 
            for param in base_params: 
               param.requires_grad = False


                    labels_tensor = labels_tensor.long()
                    inputs_tensor = inputs_tensor.to(device)
                    labels_tensor = labels_tensor.to(device)

                    
                    outputs = model.forward(inputs_tensor)                       
                    _, preds = torch.max(outputs, 1)
                    
                    
                    
                    outputs = outputs.view(batchsize,2)
                    preds=preds.view(batchsize,-1)
                    print ('outputs',outputs)
                    print ('labels_tensor',labels_tensor)

                    loss = criterion(outputs,labels_tensor)                  
                    
                    if counter%50==0:
                       print ("loss= :", loss.item())
                       print("Reached iteration ",counter)
                    
                    counter+=1

                    # backward + optimize only if in training phase
                    if phase == 'train':
                       optimizer.zero_grad()
                       loss.backward()                           
                       optimizer.step()
                       
            
               # print evaluation statistics
               try:
                    running_loss += loss.item() * inputs.size(0)
                    #print ('preds={} lables={}'.format(preds, labels_tensor.data))
                    running_corrects += torch.sum(preds == labels_tensor.data)
                    print ('running_corrects {} counts {}'.format(running_corrects, counter))
               except:
                    print('unexpected error, could not calculate loss or do a sum.')
        
    print('trying epoch loss')
    epoch_loss = running_loss / length
    epoch_acc = running_corrects.double() / length
    print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))


    #model.save_state_dict ('newWeakmodel_{}.pth'.format(num_epochs))
            


    # deep copy the model
    if phase == 'val':
        if USE_TENSORBOARD:
            foo.add_scalar_value('epoch_loss',epoch_loss,step=epoch)
            foo.add_scalar_value('epoch_acc',epoch_acc,step=epoch)
        if epoch_acc > best_acc:
            best_acc = epoch_acc
            best_model_wts = copy.deepcopy(model.state_dict())
            print('new best accuracy = ',best_acc)


time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))        
print('Best val Acc: {:4f}'.format(best_acc))
print('returning and looping back')

best_model_wts = copy.deepcopy(model.state_dict())
# load best model weights
model.load_state_dict(best_model_wts)
return model

Today I happen to face the same error. I was sending 2 images of size 320x240 each and I was getting out of memory exception. I reduced the image sizes to 180x180 each and my model ran perfectly. This could be one of the solution.

Do you solve the problem! I meet the same problem of you! If you have solved it already, can you tell me how to do?