Hi Guys,
Initially, I was able to train my model, but it performed really poorly, so I added some augmentation to the images and tried to retrain it to improve the results, but then I got the memory error shown below. However, when I used transfer learning instead (MobileNetV2 and ResNeXt50), I was able to run training, with slightly better (though still not great) accuracy.
I’ve adjusted the convolutional layers as well as the fully connected layers, all to no avail.
Data:
# Training-time preprocessing and augmentation pipeline.
# Each image is resized to 256x256 and centre-cropped to 120x120, then randomly
# perturbed (brightness jitter, grayscale with probability 0.2, rotation of up
# to +/-30 degrees) before being converted to a tensor and normalised.
# NOTE(review): `mean` and `std` are defined elsewhere; presumably per-channel
# dataset statistics -- confirm.
train_transforms = transforms.Compose(
    [
        transforms.Resize((256, 256)),
        transforms.CenterCrop((120, 120)),
        transforms.ColorJitter(brightness=0.5),
        transforms.RandomGrayscale(p=0.2),
        # Use torchvision's InterpolationMode enum; passing PIL's integer
        # constant (PIL.Image.BILINEAR) is deprecated by torchvision.
        transforms.RandomRotation(
            degrees=30,
            interpolation=transforms.InterpolationMode.BILINEAR,
        ),
        transforms.ToTensor(),
        transforms.Normalize(torch.Tensor(mean), torch.Tensor(std)),
    ]
)
# Validation-time preprocessing: the same resize, centre crop, tensor
# conversion and normalisation steps, with no random augmentation.
valid_transforms = transforms.Compose(
    [
        transforms.Resize(size=(256, 256)),
        transforms.CenterCrop(size=(120, 120)),
        transforms.ToTensor(),
        transforms.Normalize(mean=torch.Tensor(mean), std=torch.Tensor(std)),
    ]
)
Model Summary
Sequential(
(conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(batchnorm1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu1): ReLU()
(pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(batchnorm2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu2): ReLU()
(pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(batchnorm3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu3): ReLU()
(pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(flatten): Flatten(start_dim=1, end_dim=-1)
(fc1): Linear(in_features=28800, out_features=15000, bias=False)
(fbatchnorm1): BatchNorm1d(15000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu4): ReLU()
(dropout1): Dropout(p=0.7, inplace=False)
(fc2): Linear(in_features=15000, out_features=7000, bias=False)
(fbatchnorm2): BatchNorm1d(7000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu5): ReLU()
(dropout2): Dropout(p=0.7, inplace=False)
(fc3): Linear(in_features=7000, out_features=3000, bias=False)
(fbatchnorm3): BatchNorm1d(3000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu6): ReLU()
(dropout3): Dropout(p=0.7, inplace=False)
(fc6): Linear(in_features=3000, out_features=8, bias=True)
)
# Classification loss used by train() for both the training and validation passes.
loss_func = nn.CrossEntropyLoss()

# Move the model onto the target device before building the optimizer over
# its parameters.
model = model.to(device)

# Plain SGD over every model parameter; `learning_rate` is defined elsewhere.
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)
def train(model, num_epochs, train_dl, valid_dl):
    """Train ``model`` for ``num_epochs`` epochs, validating after each epoch.

    Uses the module-level ``device``, ``loss_func`` and ``optimizer``.

    Args:
        model: Network to optimise; called directly on each input batch.
        num_epochs: Number of full passes over ``train_dl``.
        train_dl: Iterable of ``(inputs, targets)`` batches that also exposes
            a ``dataset`` attribute supporting ``len()``.
        valid_dl: Same shape of iterable, used for evaluation only.

    Returns:
        A tuple ``(loss_hist_train, loss_hist_valid, accuracy_hist_train,
        accuracy_hist_valid)`` of lists with one entry per epoch. Losses are
        per-sample averages; accuracies are the fraction of correct
        predictions.
    """
    loss_hist_train, accuracy_hist_train = [], []
    loss_hist_valid, accuracy_hist_valid = [], []

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        running_loss, running_correct = 0, 0
        for inputs, targets in train_dl:
            inputs, targets = inputs.to(device), targets.to(device)
            logits = model(inputs)
            batch_loss = loss_func(logits, targets)
            batch_loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # Weight the batch loss by the batch size so the epoch total can
            # be divided by the dataset size below.
            running_loss += batch_loss.item() * targets.size(0)
            hits = (logits.argmax(dim=1) == targets).float()
            running_correct += hits.sum().cpu()
        train_size = len(train_dl.dataset)
        loss_hist_train.append(running_loss / train_size)
        accuracy_hist_train.append(running_correct / train_size)

        # ---- validation pass (no gradient tracking) ----
        model.eval()
        running_loss, running_correct = 0, 0
        with torch.no_grad():
            for inputs, targets in valid_dl:
                inputs, targets = inputs.to(device), targets.to(device)
                logits = model(inputs)
                batch_loss = loss_func(logits, targets)
                running_loss += batch_loss.item() * targets.size(0)
                hits = (logits.argmax(dim=1) == targets).float()
                running_correct += hits.sum().cpu()
        valid_size = len(valid_dl.dataset)
        loss_hist_valid.append(running_loss / valid_size)
        accuracy_hist_valid.append(running_correct / valid_size)

        print(f'Epoch {epoch+1} accuracy: '
              f'{accuracy_hist_train[epoch]:.4f} val_accuracy: '
              f'{accuracy_hist_valid[epoch]:.4f}')

    return loss_hist_train, loss_hist_valid, accuracy_hist_train, accuracy_hist_valid
Error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Input In [23], in <cell line: 3>()
1 torch.manual_seed(123)
2 num_epochs=100
----> 3 model_sum = train(model, num_epochs, train_dataset, valid_dataset)
Input In [22], in train(model, num_epochs, train_dl, valid_dl)
14 pred = model(x_batch)
15 loss = loss_func(pred, y_batch)
---> 17 loss.backward()
18 optimizer.step()
19 optimizer.zero_grad()
File ~\anaconda3\envs\myai\lib\site-packages\torch\_tensor.py:396, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
387 if has_torch_function_unary(self):
388 return handle_torch_function(
389 Tensor.backward,
390 (self,),
(...)
394 create_graph=create_graph,
395 inputs=inputs)
--> 396 torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File ~\anaconda3\envs\myai\lib\site-packages\torch\autograd\__init__.py:173, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
168 retain_graph = create_graph
170 # The reason we repeat same the comment below is that
171 # some Python versions print out the first line of a multi-line function
172 # calls in the traceback and some print out the last line
--> 173 Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
174 tensors, grad_tensors_, retain_graph, create_graph, inputs,
175 allow_unreachable=True, accumulate_grad=True)
RuntimeError: CUDA out of memory. Tried to allocate 1.61 GiB (GPU 0; 6.00 GiB total capacity; 4.19 GiB already allocated; 0 bytes free; 4.21 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
I'm using batch_size=4 and num_epochs=100. Any advice would be appreciated.
Thanks