when I use register_hook to try to print some layer’s info of gradient, when I do not use torch.nn.DataParallel(), no gpu memory leak happen ,however when I try to use torch.nn.DataParallel(). The cuda:0 card’s memory is increasing after running register_hook function and making inference the network, what is the reason of it and how to solve it ,can anyone help?
pytorch edition 1.2.0 python 2.7 cuda 10.0
class CaffeModel(nn.Module):
def init(self, basemodel_protofile, height=384, width=128):
super(CaffeModel, self).init()
self.base = CaffeNet(basemodel_protofile, width=width, height=height, omit_data_layer=True, phase='TRAIN')
def forward(self, x, key_, gradkeylt = [], normalize = True, test = False):
x = self.base(x)
y = x[key_]
if normalize:
y = F.normalize(y, p=2, dim=1)
if not test:
y = y * 200
for gradkey in gradkeylt:
if gradkey in x.keys() and x[gradkey].device.index == 0:
ipdb.set_trace()
print(gradkey)
x[gradkey].register_hook(self.bh)
return y
def load_checkpoint_for_basemodel(self, checkpoint):
print("Loading checkpoint ....")
model_dict = self.base.state_dict()
pretrained_dict = torch.load(checkpoint)
new_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict.keys()}
model_dict.update(new_dict)
print('Total : {}, update: {}'.format(len(pretrained_dict), len(new_dict)))
self.base.load_state_dict(model_dict)
print("loaded finished!")
def bh(self, grad_):
grad_.detach_()
print("max:%.8f, min:%.8f, std:%.8f, mean:%.8f" % (grad_.abs().max().item(), grad_.abs().min().item(), grad_.abs().std().item(), grad_.abs().mean().item()))
del grad_