CUDA: out-of-memory error

Hi,
I am trying to train a model using k80.
I get this error
RuntimeError: CUDA out of memory. Tried to allocate 1.38 GiB (GPU 0; 11.17 GiB total capacity; 10.05 GiB already allocated; 514.31 MiB free; 10.28 GiB reserved in total by PyTorch)
I tried training the model with a much smaller batch size, but then training becomes extremely slow (about 2 hours per epoch).
Is there a way to fix this issue?
Following is my code:

def train_epoch(model, training_data, optimizer, pred_loss_func, opt):
    """Run one training epoch and return summary metrics.

    For each batch: moves tensors to ``opt.device``, runs the model forward,
    combines the Hawkes-process negative log-likelihood, the event-type
    prediction loss, and the (scaled) time-prediction squared error into one
    loss, and takes an optimizer step.

    Args:
        model: called as ``model(event_type, event_time)``; expected to return
            ``(enc_out, prediction)`` where ``prediction[0]`` is the type
            prediction and ``prediction[1]`` the time prediction.
        training_data: iterable of ``(event_time, time_gap, event_type)`` batches.
        optimizer: torch optimizer, stepped once per batch.
        pred_loss_func: loss function forwarded to ``Utils.type_loss``.
        opt: options namespace; only ``opt.device`` is read here.

    Returns:
        Tuple of (average event log-likelihood per event,
        type-prediction accuracy, RMSE of the time prediction).
    """
    model.train()

    total_event_ll = 0  # cumulative event log-likelihood
    total_time_se = 0  # cumulative time prediction squared-error
    total_event_rate = 0  # cumulative number of correct predictions
    total_num_event = 0  # number of total events
    total_num_pred = 0  # number of predictions

    for batch in tqdm(training_data, mininterval=2,
                      desc='  - (Training)   ', leave=False):
        # prepare data
        event_time, time_gap, event_type = map(lambda x: x.to(opt.device), batch)

        # forward
        optimizer.zero_grad()
        enc_out, prediction = model(event_type, event_time)

        # backward
        # negative log-likelihood
        event_ll, non_event_ll = Utils.log_likelihood(model, enc_out, event_time, event_type)
        event_loss = -torch.sum(event_ll - non_event_ll)
        # type prediction
        pred_loss, pred_num_event = Utils.type_loss(prediction[0], event_type, pred_loss_func)
        # time prediction
        se = Utils.time_loss(prediction[1], event_time)
        # SE is usually large, scale it to stabilize training
        scale_time_loss = 100
        loss = event_loss + pred_loss + se / scale_time_loss
        loss.backward()

        # update parameters
        optimizer.step()

        # note keeping
        # .item() converts to Python scalars, so the accumulators never
        # keep the autograd graph alive.
        num_event = event_type.ne(Constants.PAD).sum().item()  # hoisted: was computed twice
        total_event_ll += -event_loss.item()
        total_time_se += se.item()
        total_event_rate += pred_num_event.item()
        total_num_event += num_event
        # we do not predict the first event of each sequence
        total_num_pred += num_event - event_time.shape[0]

        # Drop every tensor that still references this batch's computation
        # graph BEFORE the next forward pass.  Otherwise the old graph and
        # the new one coexist at peak, roughly doubling GPU memory — a
        # common cause of "CUDA out of memory" mid-epoch.
        del enc_out, prediction, event_ll, non_event_ll
        del event_loss, pred_loss, pred_num_event, se, loss
        del event_time, time_gap, event_type

    rmse = np.sqrt(total_time_se / total_num_pred)
    return total_event_ll / total_num_event, total_event_rate / total_num_pred, rmse