I am new to Pytorch… and am trying to train a neural network on Colab. Relatively speaking, my dataset is not very large, yet after three epochs I run out of GPU memory and get the following warning.
RuntimeError: CUDA out of memory. Tried to allocate 106.00 MiB (GPU 0; 14.73 GiB total capacity; 13.58 GiB already allocated; 63.88 MiB free; 13.73 GiB reserved in total by PyTorch)
I am really not sure where my bottleneck is. I have tried a number of things which I have found on this forum but nothing worked so far. I tried detaching my validation set from GPU when calculating validation loss, I have tried setting model to eval during validation also. Here is my model, wondering if anyone could give me some pointers as to where the bottleneck might be.
BATCH_SIZE = 8 train_X, train_Y = torch.from_numpy(train_x), torch.from_numpy(train_y) valid_X, valid_Y = torch.from_numpy(valid_x), torch.from_numpy(valid_y) train_X = train_X.to('cuda') train_Y = train_Y.to('cuda') valid_X = valid_X.to('cuda') valid_Y = valid_Y.to('cuda') train_df = TensorDataset(train_X, train_Y) valid_df = TensorDataset(valid_X, valid_Y) train_dl = DataLoader(train_df, BATCH_SIZE, shuffle=False)
class SimpleNet(nn.Module): def __init__(self): super().__init__() self.linear1 = nn.Linear(NUM_FEATURES, 500) self.act1 = nn.ReLU() self.linear2 = nn.Linear(500, 500) self.act2 = nn.ReLU() self.linear3 = nn.Linear(500, NUM_LABELS) def forward(self, x): x = self.linear1(x) x = self.act1(x) x = self.linear2(x) x = self.act2(x) x = self.linear3(x) return x
model = SimpleNet().to('cuda') loss_fn = mse_loss(train_Y.float(),train_Y.float()).to('cuda') opt = torch.optim.Adam(model.parameters(), 1e-5) def fit(num_epochs, model, loss_fn, opt): epoch_number =  train_losses =  val_losses =  for epoch in range(num_epochs): for xb,yb in train_dl: y_hat = model(xb.float()) loss = loss_fn(yb, y_hat) loss.backward() opt.step() opt.zero_grad() model.eval() train_loss = loss_fn(model(train_X.float()), train_Y.float()) val_loss = loss_fn(model(valid_X.float()), valid_Y.float()) model.train() epoch_number.append(epoch) train_losses.append(train_loss) val_losses.append(val_loss) if epoch % 10 == 0: print('Training loss: %.3f, Validation loss: %.3f' % (train_loss, val_loss)) plt.plot(epoch_number, train_losses, label='Training Loss') plt.plot(epoch_number, val_losses, label='Validation Loss') plt.legend() plt.show() fit(200, model, mse_loss, opt)