CUDA out of memory. Tried to allocate 1.61 GiB

Hi Guys,

Initially, I was able to train my model but it performed really poorly, so I did some augmentation on the images and tried to retrain to improve but got this memory error. However, I used transfer learning (MobileNetV2 and ResNext50) and I was able to run the same model, but with slightly better accuracy (not that great).

I’ve readjusted the convolution layers as well as fully connected layers, all to no avail.

Data:

train_transforms = transforms.Compose([transforms.Resize((256,256)),
                                       transforms.CenterCrop((120,120)),
                                       transforms.ColorJitter(brightness=0.5),
                                       transforms.RandomGrayscale(p=0.2),
                                       transforms.RandomRotation(degrees=30, interpolation=PIL.Image.BILINEAR),
                                       transforms.ToTensor(),
                                       transforms.Normalize(torch.Tensor(mean), torch.Tensor(std))])

valid_transforms = transforms.Compose([transforms.Resize((256,256)),
                                       transforms.CenterCrop((120,120)),
                                       transforms.ToTensor(),
                                       transforms.Normalize(torch.Tensor(mean), torch.Tensor(std))])

Model Summary

Sequential(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (batchnorm1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU()
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (batchnorm2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU()
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (batchnorm3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu3): ReLU()
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=28800, out_features=15000, bias=False)
  (fbatchnorm1): BatchNorm1d(15000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu4): ReLU()
  (dropout1): Dropout(p=0.7, inplace=False)
  (fc2): Linear(in_features=15000, out_features=7000, bias=False)
  (fbatchnorm2): BatchNorm1d(7000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu5): ReLU()
  (dropout2): Dropout(p=0.7, inplace=False)
  (fc3): Linear(in_features=7000, out_features=3000, bias=False)
  (fbatchnorm3): BatchNorm1d(3000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu6): ReLU()
  (dropout3): Dropout(p=0.7, inplace=False)
  (fc6): Linear(in_features=3000, out_features=8, bias=True)

model = model.to(device)
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

def train(model, num_epochs, train_dl, valid_dl):
    loss_hist_train = [0]*num_epochs
    accuracy_hist_train = [0]*num_epochs
    loss_hist_valid = [0]*num_epochs
    accuracy_hist_valid = [0]*num_epochs
    
    for epoch in range(num_epochs):
        model.train()
        for x_batch, y_batch in train_dl:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            pred = model(x_batch)
            loss = loss_func(pred, y_batch)
                
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss_hist_train[epoch] += loss.item()*y_batch.size(0)
            is_correct = (torch.argmax(pred, dim=1)==y_batch).float()
            accuracy_hist_train[epoch] += is_correct.sum().cpu()
        loss_hist_train[epoch] /= len(train_dl.dataset)
        accuracy_hist_train[epoch] /= len(train_dl.dataset)
        
        model.eval()
        
        with torch.no_grad():
            for x_batch, y_batch in valid_dl:
                x_batch = x_batch.to(device)
                pred = model(x_batch)
                y_batch = y_batch.to(device)
                loss = loss_func(pred, y_batch)
                loss_hist_valid[epoch] += loss.item()*y_batch.size(0)
                is_correct = (torch.argmax(pred, dim=1)==y_batch).float()
                accuracy_hist_valid[epoch] += is_correct.sum().cpu()
            loss_hist_valid[epoch] /= len(valid_dl.dataset)
            accuracy_hist_valid[epoch] /= len(valid_dl.dataset)
            
            print(f'Epoch {epoch+1} accuracy: '
                  f'{accuracy_hist_train[epoch]:.4f} val_accuracy: '
                  f'{accuracy_hist_valid[epoch]:.4f}')
    return loss_hist_train, loss_hist_valid, accuracy_hist_train, accuracy_hist_valid

Error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Input In [23], in <cell line: 3>()
      1 torch.manual_seed(123)
      2 num_epochs=100
----> 3 model_sum = train(model, num_epochs, train_dataset, valid_dataset)

Input In [22], in train(model, num_epochs, train_dl, valid_dl)
     14 pred = model(x_batch)
     15 loss = loss_func(pred, y_batch)
---> 17 loss.backward()
     18 optimizer.step()
     19 optimizer.zero_grad()

File ~\anaconda3\envs\myai\lib\site-packages\torch\_tensor.py:396, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
    387 if has_torch_function_unary(self):
    388     return handle_torch_function(
    389         Tensor.backward,
    390         (self,),
   (...)
    394         create_graph=create_graph,
    395         inputs=inputs)
--> 396 torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)

File ~\anaconda3\envs\myai\lib\site-packages\torch\autograd\__init__.py:173, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    168     retain_graph = create_graph
    170 # The reason we repeat same the comment below is that
    171 # some Python versions print out the first line of a multi-line function
    172 # calls in the traceback and some print out the last line
--> 173 Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    174     tensors, grad_tensors_, retain_graph, create_graph, inputs,
    175     allow_unreachable=True, accumulate_grad=True)

RuntimeError: CUDA out of memory. Tried to allocate 1.61 GiB (GPU 0; 6.00 GiB total capacity; 4.19 GiB already allocated; 0 bytes free; 4.21 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

Batch_size=4, num_epochs=100, any advice would be appreciated

Thanks