Hi,
I am trying to train a model on a K80 GPU.
I get this error:
RuntimeError: CUDA out of memory. Tried to allocate 1.38 GiB (GPU 0; 11.17 GiB total capacity; 10.05 GiB already allocated; 514.31 MiB free; 10.28 GiB reserved in total by PyTorch)
I tried training the model with a much smaller batch size, but training then becomes extremely slow (about 2 hours per epoch).
Is there a way to fix this issue?
The following is my code:
def train_epoch(model, training_data, optimizer, pred_loss_func, opt):
    """Run one training epoch and return the aggregated metrics.

    For every batch the loss is the sum of three terms: the negative event
    log-likelihood, the event-type prediction loss, and the time-prediction
    squared error divided by a fixed scale of 100. The optimizer takes one
    step per batch.

    Args:
        model: Module called as ``model(event_type, event_time)``; it must
            return ``(enc_out, prediction)``, where ``prediction[0]`` is
            fed to the type loss and ``prediction[1]`` to the time loss.
        training_data: Iterable of batches, each unpacking to exactly
            ``(event_time, time_gap, event_type)``.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.
        pred_loss_func: Loss function forwarded to ``Utils.type_loss``.
        opt: Options object; only ``opt.device`` is read here.

    Returns:
        A 3-tuple ``(log_likelihood_per_event, type_prediction_rate, rmse)``:
        the summed log-likelihood divided by the number of non-padding
        events, the summed ``pred_num_event`` divided by the number of
        predictions, and the square root of the summed squared time error
        divided by the number of predictions.

    Raises:
        ZeroDivisionError: If no events or no predictions were counted
            (for example when ``training_data`` is empty).
    """
    model.train()

    # SE is usually large, scale it to stabilize training.
    # Loop-invariant, so it is defined once instead of once per batch.
    scale_time_loss = 100

    total_event_ll = 0  # cumulative event log-likelihood
    total_time_se = 0  # cumulative time prediction squared-error
    total_event_rate = 0  # cumulative number of correct predictions
    total_num_event = 0  # number of total events
    total_num_pred = 0  # number of predictions

    for batch in tqdm(training_data, mininterval=2,
                      desc=' - (Training) ', leave=False):
        # Prepare data. The time gaps are never used in this function, so
        # they are deliberately left on the host instead of being copied
        # to the accelerator with the other two tensors.
        event_time, _time_gap, event_type = batch
        event_time = event_time.to(opt.device)
        event_type = event_type.to(opt.device)

        # Forward pass.
        optimizer.zero_grad()
        enc_out, prediction = model(event_type, event_time)

        # Negative log-likelihood.
        event_ll, non_event_ll = Utils.log_likelihood(
            model, enc_out, event_time, event_type)
        event_loss = -torch.sum(event_ll - non_event_ll)

        # Type prediction.
        pred_loss, pred_num_event = Utils.type_loss(
            prediction[0], event_type, pred_loss_func)

        # Time prediction.
        se = Utils.time_loss(prediction[1], event_time)

        # Backward pass and parameter update.
        loss = event_loss + pred_loss + se / scale_time_loss
        loss.backward()
        optimizer.step()

        # Bookkeeping. ``.item()`` yields plain Python numbers, so the
        # running totals hold no reference to the autograd graph.
        # The non-padding count is needed twice; compute (and sync) once.
        num_event = event_type.ne(Constants.PAD).sum().item()
        total_event_ll += -event_loss.item()
        total_time_se += se.item()
        total_event_rate += pred_num_event.item()
        total_num_event += num_event
        # We do not predict the first event: subtract ``shape[0]``
        # (presumably one per sequence in the batch -- confirm that dim 0
        # of ``event_time`` is the batch dimension).
        total_num_pred += num_event - event_time.shape[0]

    rmse = np.sqrt(total_time_se / total_num_pred)
    return (total_event_ll / total_num_event,
            total_event_rate / total_num_pred,
            rmse)