I am new to Pytorch… and am trying to train a neural network on Colab. Relatively speaking, my dataset is not very large, yet after three epochs I run out of GPU memory and get the following warning.
RuntimeError: CUDA out of memory. Tried to allocate 106.00 MiB (GPU 0; 14.73 GiB total capacity; 13.58 GiB already allocated; 63.88 MiB free; 13.73 GiB reserved in total by PyTorch)
I am really not sure where my bottleneck is. I have tried a number of things which I have found on this forum but nothing worked so far. I tried detaching my validation set from GPU when calculating validation loss, I have tried setting model to eval during validation also. Here is my model, wondering if anyone could give me some pointers as to where the bottleneck might be.
BATCH_SIZE = 8
train_X, train_Y = torch.from_numpy(train_x), torch.from_numpy(train_y)
valid_X, valid_Y = torch.from_numpy(valid_x), torch.from_numpy(valid_y)
train_X = train_X.to('cuda')
train_Y = train_Y.to('cuda')
valid_X = valid_X.to('cuda')
valid_Y = valid_Y.to('cuda')
train_df = TensorDataset(train_X, train_Y)
valid_df = TensorDataset(valid_X, valid_Y)
train_dl = DataLoader(train_df, BATCH_SIZE, shuffle=False)
class SimpleNet(nn.Module):
def __init__(self):
super().__init__()
self.linear1 = nn.Linear(NUM_FEATURES, 500)
self.act1 = nn.ReLU()
self.linear2 = nn.Linear(500, 500)
self.act2 = nn.ReLU()
self.linear3 = nn.Linear(500, NUM_LABELS)
def forward(self, x):
x = self.linear1(x)
x = self.act1(x)
x = self.linear2(x)
x = self.act2(x)
x = self.linear3(x)
return x
model = SimpleNet().to('cuda')
loss_fn = mse_loss(train_Y.float(),train_Y.float()).to('cuda')
opt = torch.optim.Adam(model.parameters(), 1e-5)
def fit(num_epochs, model, loss_fn, opt):
epoch_number = []
train_losses = []
val_losses = []
for epoch in range(num_epochs):
for xb,yb in train_dl:
y_hat = model(xb.float())
loss = loss_fn(yb, y_hat)
loss.backward()
opt.step()
opt.zero_grad()
model.eval()
train_loss = loss_fn(model(train_X.float()), train_Y.float())
val_loss = loss_fn(model(valid_X.float()), valid_Y.float())
model.train()
epoch_number.append(epoch)
train_losses.append(train_loss)
val_losses.append(val_loss)
if epoch % 10 == 0:
print('Training loss: %.3f, Validation loss: %.3f' % (train_loss, val_loss))
plt.plot(epoch_number, train_losses, label='Training Loss')
plt.plot(epoch_number, val_losses, label='Validation Loss')
plt.legend()
plt.show()
fit(200, model, mse_loss, opt)