Problem while checking gradient of a intermediate feature map

I want to check the gradient in a fully-connected layer (which is a feature map) w.r.t every image in a batch. Specifically, I want to print the norm of gradient of every image at a specific layer. So I used register_hook in my code:

class Net(nn.Module):
    def __init__(self):
        ...
        
    def forward(self, data, target):
        feat = self.model.forward(data)
        loss = self.loss.forward(feat, target)
        feat.register_hook(self.grad_norm)
        loss.backward()

    def grad_norm(self):
        def hook(grad):
            print(torch.norm(grad))
        return hook

However, as I run this code, error show up:

loss.backward()
File “/usr/local/lib/python2.7/dist-packages/torch/autograd/variable.py”, line 156, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, retain_variables)
File “/usr/local/lib/python2.7/dist-packages/torch/autograd/init.py”, line 98, in backward
variables, grad_variables, retain_graph)
TypeError: ‘list’ object is not callable

Is there something wrong in my implementation? Thanks!

self.model(data), not self.model.forward(data).

Similar mistake for loss.

Thanks for the correction.

However, the same error still pops up. Do you have any idea about the error?

Thanks!

can you give a complete script that i can run?

OK, I refined my code and here it is. Line 54 is where I added the hook.

from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from time import gmtime, strftime
from torchvision import datasets, transforms
from torch.utils.data.sampler import Sampler

# Training settings
parser = argparse.ArgumentParser(description='PyTorch Face Rec Example')
parser.add_argument('--batch-size',         type=int, default=256, help='input batch size for training (default: 64)')
parser.add_argument('--epochs',             type=int, default=30, help='number of epochs to train (default: 10)')
parser.add_argument('--no-cuda',            action='store_true', default=False, help='enables CUDA training')
parser.add_argument('--n-gpu',              type=int, default=1, help='number of gpu(s) used')
parser.add_argument('--seed',               type=int, default=1, help='random seed (default: 1)')
parser.add_argument('--log-interval',       type=int, default=10, help='how many batches to wait before logging training status')
parser.add_argument('--feat-size',          type=int, default=512, help='how long is the extracted feature vector')
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)


class TrainTestNet(object):
    def __init__(self, model, data_loader, val_loader=None, cuda=True, n_gpu=1):
        self.model = model
        self.grad_norm = []
        self.loss = LossBlock(model.feat_size)
        self.data_loader = data_loader
        if val_loader is not None:
            self.val_loader = val_loader
        self.is_cuda = cuda
        if self.is_cuda:
            self.model.cuda()
            self.loss.cuda()
            gpu_id = [i for i in range(n_gpu)]
            self.model = torch.nn.DataParallel(self.model, device_ids=gpu_id)
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.9)

    def train_step(self, epoch, log_interval=100):
        self.model.train()
        for batch_idx, (data, target) in enumerate(self.data_loader):
            if self.is_cuda:
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data), Variable(target)
            self.optimizer.zero_grad()
            feat = self.model(data)
            loss = self.loss(feat, target)
            feat.register_hook(self.grad_norm)

            loss.backward()
            self.optimizer.step()
            if batch_idx % log_interval == 0:
                print(
                    strftime("%Y-%m-%d %H:%M:%S", gmtime())
                    + '\tTrain Epoch: {} [{}/{} ({:.0f}%)]\t'.format(epoch, batch_idx * len(data),
                                                                     len(self.data_loader.dataset),
                                                                     100. * batch_idx / len(self.data_loader))
                    + '\t'.join(['{}: {:.6f}'.format(key, self.loss.loss_value[key].data[0]) for key in
                                 self.loss.loss_value.keys()]))

    def grad_norm(self):
        def hook(grad):
            print(torch.norm(grad))
        return hook


# 3x3 Convolution
def conv3x3(in_channels, out_channels, stride=1, padding=1, bias_term=False):
    l = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=padding, bias=bias_term)
    if bias_term:
        nn.init.constant(l.bias.data, 0.0)
    return l

# conv3x3 + prelu
class Conv3x3Block(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, padding=1, bias_term=False):
        super(Conv3x3Block, self).__init__()
        self.conv = conv3x3(in_channels, out_channels, stride=stride, padding=padding, bias_term=bias_term)
        self.prelu = nn.PReLU()

    def forward(self, x):
        out = self.conv(x)
        out = self.prelu(out)
        return out

# Face Residual Block
class FaceResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(FaceResidualBlock, self).__init__()
        self.conv1 = Conv3x3Block(in_channels, out_channels)
        self.conv2 = Conv3x3Block(out_channels, out_channels)

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.conv2(out)
        out += residual
        return out

# FaceResNet Module
class FaceResNet(nn.Module):
    def __init__(self, n_blocks, feat_size=512):
        super(FaceResNet, self).__init__()
        self.feat_size = feat_size
        self.in_channels = 64
        self.conv0a = Conv3x3Block(1, 32, padding=0, bias_term=True)
        self.conv0b = Conv3x3Block(32, self.in_channels, padding=0, bias_term=True)
        self.layer1 = self.make_layer(FaceResidualBlock, 128, n_blocks[0], stride=2)
        self.layer2 = self.make_layer(FaceResidualBlock, 256, n_blocks[1], stride=2)
        self.fc5 = nn.Linear(2304, self.feat_size, bias=True)

    def make_layer(self, block, out_channels, n_blocks, stride=2):
        layers = []
        for i in range(0, n_blocks):
            layers.append(block(self.in_channels, self.in_channels))
        if stride > 1:
            layers.append(Conv3x3Block(self.in_channels, out_channels, padding=0, bias_term=True))
            self.in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv0a(x)
        out = self.conv0b(out)
        out = F.max_pool2d(out, 2)
        out = self.layer1(out)
        out = F.max_pool2d(out, 2)
        out = self.layer2(out)
        out = out.view(-1, 2304)
        out = self.fc5(out)
        return out


class LossBlock(nn.Module):
    def __init__(self, feat_size):
        super(LossBlock, self).__init__()
        n_class = 10
        self.loss_value = {}
        self.softmax = SoftmaxLoss(feat_size, n_class)

    def forward(self, x, y):
        self.loss_value['softmax'] = self.softmax(x, y)
        loss = self.loss_value['softmax']
        return loss

class SoftmaxLoss(nn.Module):
    def __init__(self, input_size, output_size, loss_weight=1.0):
        super(SoftmaxLoss, self).__init__()
        self.fc = nn.Linear(input_size, output_size, bias=False)
        self.loss_weight = loss_weight

    def forward(self, x, y):
        x = F.log_softmax(self.fc(x))
        self.prob = x
        return F.nll_loss(x, y).mul_(self.loss_weight)

def mnist_loader(batch_size, cuda=True):
    kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('mnist_example/data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('mnist_example/data', train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])),
        batch_size=batch_size, shuffle=True, **kwargs)
    return train_loader, test_loader



# load data
train_loader, val_loader = mnist_loader(args.batch_size, cuda=args.cuda)
net_solver = TrainTestNet(FaceResNet([1, 1, 1, 1], args.feat_size), train_loader, val_loader,
                          cuda=args.cuda, n_gpu=args.n_gpu)
print(net_solver.model)

for epoch in range(1, args.epochs + 1):
    net_solver.train_step(epoch, log_interval=10)

self.grad_norm is a list.
I dont know why you are giving it to feat.register_hook(self.grad_norm).
register_hook takes functions / closures / lambdas.

http://pytorch.org/docs/master/autograd.html?highlight=register_hook#torch.autograd.Variable.register_hook