How does a PyTorch nn.Module keep its computation graph, and when/how is it released?

I got this piece of code from:

def train(train_loader, model, optimizer, epoch):
    # switch to train mode
    model.train()

    pbar = tqdm(enumerate(train_loader))
    labels, distances = [], []

    for batch_idx, (data_a, data_p, data_n, label_p, label_n) in pbar:

        data_a, data_p, data_n = data_a.cuda(), data_p.cuda(), data_n.cuda()
        data_a, data_p, data_n = Variable(data_a), Variable(data_p), \
                                 Variable(data_n)

        # compute output
        out_a, out_p, out_n = model(data_a), model(data_p), model(data_n)

        # Choose the hard negatives
        d_p = l2_dist.forward(out_a, out_p)
        d_n = l2_dist.forward(out_a, out_n)
        all = (d_n - d_p < args.margin).cpu().data.numpy().flatten()
        hard_triplets = np.where(all == 1)
        if len(hard_triplets[0]) == 0:
            continue
        out_selected_a = Variable(torch.from_numpy(out_a.cpu().data.numpy()[hard_triplets]).cuda())
        out_selected_p = Variable(torch.from_numpy(out_p.cpu().data.numpy()[hard_triplets]).cuda())
        out_selected_n = Variable(torch.from_numpy(out_n.cpu().data.numpy()[hard_triplets]).cuda())

        selected_data_a = Variable(torch.from_numpy(data_a.cpu().data.numpy()[hard_triplets]).cuda())
        selected_data_p = Variable(torch.from_numpy(data_p.cpu().data.numpy()[hard_triplets]).cuda())
        selected_data_n = Variable(torch.from_numpy(data_n.cpu().data.numpy()[hard_triplets]).cuda())

        selected_label_p = torch.from_numpy(label_p.cpu().numpy()[hard_triplets])
        selected_label_n = torch.from_numpy(label_n.cpu().numpy()[hard_triplets])
        triplet_loss = TripletMarginLoss(args.margin).forward(out_selected_a, out_selected_p, out_selected_n)

        cls_a = model.forward_classifier(selected_data_a)
        cls_p = model.forward_classifier(selected_data_p)
        cls_n = model.forward_classifier(selected_data_n)

        criterion = nn.CrossEntropyLoss()
        predicted_labels = torch.cat([cls_a, cls_p, cls_n])
        true_labels = torch.cat([Variable(selected_label_p.cuda()), Variable(selected_label_p.cuda()), Variable(selected_label_n.cuda())])

        cross_entropy_loss = criterion(predicted_labels.cuda(), true_labels.cuda())

        loss = cross_entropy_loss + triplet_loss
        # compute gradient and update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # update the optimizer learning rate
        adjust_learning_rate(optimizer)

        # log loss value
        logger.log_value('triplet_loss', triplet_loss.data[0]).step()
        logger.log_value('cross_entropy_loss', cross_entropy_loss.data[0]).step()
        logger.log_value('total_loss', loss.data[0]).step()
        if batch_idx % args.log_interval == 0:
            pbar.set_description(
                'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f} \t # of Selected Triplets: {}'.format(
                    epoch, batch_idx * len(data_a), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader),
                    loss.data[0], len(hard_triplets[0])))

        dists = l2_dist.forward(out_selected_a, out_selected_n)  # torch.sqrt(torch.sum((out_a - out_n) ** 2, 1))  # euclidean distance
        distances.append(dists.data.cpu().numpy())
        labels.append(np.zeros(dists.size(0)))

        dists = l2_dist.forward(out_selected_a, out_selected_p)  # torch.sqrt(torch.sum((out_a - out_p) ** 2, 1))  # euclidean distance
        distances.append(dists.data.cpu().numpy())
        labels.append(np.ones(dists.size(0)))

    labels = np.array([sublabel for label in labels for sublabel in label])
    distances = np.array([subdist[0] for dist in distances for subdist in dist])

    tpr, fpr, accuracy, val, val_std, far = evaluate(distances, labels)
    print('\33[91mTrain set: Accuracy: {:.8f}\n\33[0m'.format(np.mean(accuracy)))
    logger.log_value('Train Accuracy', np.mean(accuracy))

    plot_roc(fpr, tpr, figure_name="roc_train_epoch_{}.png".format(epoch))

    # do checkpointing
    torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict()},
               '{}/checkpoint_{}.pth'.format(LOG_DIR, epoch))

As you can see, there are quite a few model.forward(…) invocations:

out_a, out_p, out_n = model(data_a), model(data_p), model(data_n)
....
cls_a = model.forward_classifier(selected_data_a)
cls_p = model.forward_classifier(selected_data_p)
cls_n = model.forward_classifier(selected_data_n)
(model.forward_classifier() also invokes model.forward())
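
To make the question concrete, here is how I picture it (a minimal sketch with a made-up TinyNet model, not the real code above; it assumes a PyTorch version where Tensor/Variable are merged and torch.cuda.memory_allocated() exists): each forward call seems to build its own graph, which is only dropped once backward() has run and the outputs that reference it are gone.

import torch
import torch.nn as nn

class TinyNet(nn.Module):
    # hypothetical toy model, only to illustrate the question
    def __init__(self):
        super(TinyNet, self).__init__()
        self.fc = nn.Linear(1024, 1024)

    def forward(self, x):
        return torch.relu(self.fc(x))

model = TinyNet().cuda()
x = torch.randn(512, 1024, device='cuda')

print(torch.cuda.memory_allocated())   # baseline: parameters + input

out1 = model(x)   # builds graph #1; its buffers hang off out1.grad_fn
out2 = model(x)   # builds graph #2, independent of the first
print(torch.cuda.memory_allocated())   # grows: both graphs are alive

loss = out1.sum() + out2.sum()
loss.backward()   # graph buffers are freed here (retain_graph=False by default)

del out1, out2, loss            # drop the output tensors themselves
torch.cuda.empty_cache()        # give cached blocks back to the driver
print(torch.cuda.memory_allocated())   # back near the baseline (+ parameter grads)

Is this the right mental model for the training code above?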

Because loss.backward() is invoked, a little GPU memory is released before the second iteration begins, but more than 30 GB is not (I have 4x 1080 Ti), and training then crashes with a CUDA out-of-memory error.
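
Before I get to the questions: this is roughly the kind of instrumentation I could add inside the loop to quantify "a little bit is released" (a sketch only; it assumes torch.cuda.memory_allocated() and torch.cuda.memory_cached() are available in the installed PyTorch version):

import torch

def log_cuda_mem(tag):
    # illustration-only helper: what the caching allocator has handed out vs. holds
    print('{}: allocated={:.1f} MB, cached={:.1f} MB'.format(
        tag,
        torch.cuda.memory_allocated() / 1024 ** 2,
        torch.cuda.memory_cached() / 1024 ** 2))

# placed around the update step inside the loop above:
log_cuda_mem('before backward')   # graphs of out_a/out_p/out_n and cls_* still alive
optimizer.zero_grad()
loss.backward()
log_cuda_mem('after backward')    # intermediate graph buffers should be freed here
optimizer.step()
log_cuda_mem('after step')        # whatever is still referenced stays allocated

With that context, my questions are: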

1. In an nn.Module, how (or where) is the computation graph stored, and how can I release the GPU memory it uses?

2. Do I have to call backward()? If yes, what should go into the .backward() method?

3. In my case, which part of the GPU memory gets released and which part is kept? Is there any way to see this?
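
Finally, to clarify what I mean by "release" in question 1, this is the kind of change I have in mind, sketched against the loop above (torch.no_grad() and torch.cuda.empty_cache() are assumptions about the installed PyTorch version, not something the original code uses; the 0.3-era equivalent would be volatile=True Variables):

# sketch only, against the loop above; not the original code
with torch.no_grad():                       # selection needs values only, so no graph is kept
    d_p = l2_dist.forward(out_a, out_p)
    d_n = l2_dist.forward(out_a, out_n)

# ... triplet/cross-entropy loss, backward(), optimizer.step() as in the loop ...

# end of the iteration: drop every reference that still owns part of a graph
del out_a, out_p, out_n, cls_a, cls_p, cls_n
del triplet_loss, cross_entropy_loss, loss
torch.cuda.empty_cache()                    # return cached blocks so nvidia-smi reflects it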