register_hook leads to CUDA out-of-memory

When I use `register_hook` to print gradient information for some layers, no GPU memory leak occurs as long as I do not use `torch.nn.DataParallel()`. However, when I do use `torch.nn.DataParallel()`, the memory on card `cuda:0` keeps increasing after registering the hook and running inference on the network. What is the reason for this, and how can I solve it? Can anyone help?
PyTorch version 1.2.0, Python 2.7, CUDA 10.0

class CaffeModel(nn.Module):
    """Wrap a CaffeNet base network and expose one named layer output as a
    (optionally L2-normalized, training-scaled) embedding.

    NOTE(review): every call to ``forward`` with a non-empty ``gradkeylt``
    registers *another* backward hook on the intermediate tensors. Under
    ``nn.DataParallel`` the hook closures can keep replica tensors alive on
    cuda:0, which is consistent with the memory growth described in the
    post — keep the handle returned by ``register_hook`` and call
    ``handle.remove()``, or register hooks once, to avoid the leak.
    """

    def __init__(self, basemodel_protofile, height=384, width=128):
        # Fixed: the pasted snippet had `init` — markdown stripped the
        # double underscores from the dunder names.
        super(CaffeModel, self).__init__()
        self.base = CaffeNet(basemodel_protofile, width=width, height=height,
                             omit_data_layer=True, phase='TRAIN')

    def forward(self, x, key_, gradkeylt=None, normalize=True, test=False):
        """Run the base net and return the feature map named ``key_``.

        Args:
            x: input tensor for the Caffe base network.
            key_: name of the layer whose output is returned.
            gradkeylt: optional list of layer names whose gradients should be
                inspected via a backward hook (device-0 replica only).
                Defaults to no hooks. (Was a mutable default ``[]``.)
            normalize: if True, L2-normalize the output along dim 1.
            test: if False (training), scale the output by 200.
        """
        if gradkeylt is None:
            gradkeylt = []
        x = self.base(x)
        y = x[key_]
        if normalize:
            y = F.normalize(y, p=2, dim=1)
        if not test:
            y = y * 200
        for gradkey in gradkeylt:
            # Restored missing loop body (lost in the paste): attach the
            # debug hook, but only on the cuda:0 replica so only one copy
            # prints under DataParallel.
            # NOTE(review): each forward() call adds a fresh hook; retain
            # the returned handle and remove() it to release memory.
            if gradkey in x and x[gradkey].device.index == 0:
                x[gradkey].register_hook(self.bh)
        return y

    def load_checkpoint_for_basemodel(self, checkpoint):
        """Load the weights from ``checkpoint`` whose keys match the base model.

        Fixed: the original built ``new_dict`` but never applied it, so the
        checkpoint was silently ignored.
        """
        print("Loading checkpoint ....")
        model_dict = self.base.state_dict()
        pretrained_dict = torch.load(checkpoint)
        new_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
        print('Total : {}, update: {}'.format(len(pretrained_dict), len(new_dict)))
        model_dict.update(new_dict)
        self.base.load_state_dict(model_dict)
        print("loaded finished!")

    def bh(self, grad_):
        """Backward hook: print magnitude statistics of the incoming gradient.

        Returns None so autograd keeps the gradient unchanged. The original
        ``del grad_`` was a no-op for GPU memory (it only dropped the local
        reference while autograd still held the tensor) and was removed.
        """
        a = grad_.abs()  # hoisted: was recomputed four times
        print("max:%.8f, min:%.8f, std:%.8f, mean:%.8f" % (
            a.max().item(), a.min().item(), a.std().item(), a.mean().item()))

I'm running into the same problem. Any solutions?