OK, I refined my code and here it is. The feat.register_hook(...) call in train_step is where I added the hook.
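Before the full script, here is the hook mechanism in isolation, as a minimal sketch of what I am going for: register_hook expects a callable that receives the gradient of that tensor during backward, so a hook factory has to be called so that the returned closure, not the bound method, gets registered.

import torch

x = torch.ones(3, requires_grad=True)
y = (x * 2).sum()

def make_hook():
    def hook(grad):             # grad is d(y)/d(x), here a tensor of 2s
        print(torch.norm(grad))
    return hook

x.register_hook(make_hook())    # note the (): register the returned closure
y.backward()                    # prints the gradient norm once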
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from time import gmtime, strftime
from torchvision import datasets, transforms
from torch.utils.data.sampler import Sampler
# Training settings
parser = argparse.ArgumentParser(description='PyTorch Face Rec Example')
parser.add_argument('--batch-size', type=int, default=256, help='input batch size for training (default: 256)')
parser.add_argument('--epochs', type=int, default=30, help='number of epochs to train (default: 30)')
parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training')
parser.add_argument('--n-gpu', type=int, default=1, help='number of gpu(s) used')
parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, help='how many batches to wait before logging training status')
parser.add_argument('--feat-size', type=int, default=512, help='how long is the extracted feature vector')
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
torch.manual_seed(args.seed)
if args.cuda:
torch.cuda.manual_seed(args.seed)
class TrainTestNet(object):
def __init__(self, model, data_loader, val_loader=None, cuda=True, n_gpu=1):
self.model = model
        self.grad_norms = []  # recorded gradient norms; renamed so it no longer shadows the grad_norm() method below
self.loss = LossBlock(model.feat_size)
self.data_loader = data_loader
if val_loader is not None:
self.val_loader = val_loader
self.is_cuda = cuda
if self.is_cuda:
self.model.cuda()
self.loss.cuda()
            gpu_id = list(range(n_gpu))
self.model = torch.nn.DataParallel(self.model, device_ids=gpu_id)
self.optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.9)
def train_step(self, epoch, log_interval=100):
self.model.train()
for batch_idx, (data, target) in enumerate(self.data_loader):
if self.is_cuda:
data, target = data.cuda(), target.cuda()
            data, target = Variable(data), Variable(target)  # no-op on PyTorch >= 0.4; kept for older versions
self.optimizer.zero_grad()
feat = self.model(data)
loss = self.loss(feat, target)
            feat.register_hook(self.grad_norm())  # call the factory: register_hook needs the returned closure, not the method
loss.backward()
self.optimizer.step()
            if batch_idx % log_interval == 0:
                print(
                    strftime("%Y-%m-%d %H:%M:%S", gmtime())
                    + '\tTrain Epoch: {} [{}/{} ({:.0f}%)]\t'.format(
                        epoch, batch_idx * len(data),
                        len(self.data_loader.dataset),
                        100. * batch_idx / len(self.data_loader))
                    + '\t'.join('{}: {:.6f}'.format(key, value.item())
                                for key, value in self.loss.loss_value.items()))
    def grad_norm(self):
        # Hook factory: the returned closure receives d(loss)/d(feat) during backward.
        def hook(grad):
            norm = torch.norm(grad)
            self.grad_norms.append(norm.item())  # keep a history of the gradient norms
            print(norm)
        return hook
# 3x3 Convolution
def conv3x3(in_channels, out_channels, stride=1, padding=1, bias_term=False):
l = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=padding, bias=bias_term)
if bias_term:
        nn.init.constant_(l.bias, 0.0)
return l
# conv3x3 + prelu
class Conv3x3Block(nn.Module):
def __init__(self, in_channels, out_channels, stride=1, padding=1, bias_term=False):
super(Conv3x3Block, self).__init__()
self.conv = conv3x3(in_channels, out_channels, stride=stride, padding=padding, bias_term=bias_term)
self.prelu = nn.PReLU()
def forward(self, x):
out = self.conv(x)
out = self.prelu(out)
return out
# Face Residual Block
class FaceResidualBlock(nn.Module):
def __init__(self, in_channels, out_channels):
super(FaceResidualBlock, self).__init__()
self.conv1 = Conv3x3Block(in_channels, out_channels)
self.conv2 = Conv3x3Block(out_channels, out_channels)
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.conv2(out)
out += residual
return out
# FaceResNet Module
class FaceResNet(nn.Module):
def __init__(self, n_blocks, feat_size=512):
super(FaceResNet, self).__init__()
self.feat_size = feat_size
self.in_channels = 64
self.conv0a = Conv3x3Block(1, 32, padding=0, bias_term=True)
self.conv0b = Conv3x3Block(32, self.in_channels, padding=0, bias_term=True)
self.layer1 = self.make_layer(FaceResidualBlock, 128, n_blocks[0], stride=2)
self.layer2 = self.make_layer(FaceResidualBlock, 256, n_blocks[1], stride=2)
self.fc5 = nn.Linear(2304, self.feat_size, bias=True)
    def make_layer(self, block, out_channels, n_blocks, stride=2):
        # Residual blocks preserve the channel count; when stride > 1 a channel-expanding
        # transition conv is appended (spatial downsampling itself happens via max_pool2d
        # in forward(), not via this conv).
        layers = []
        for i in range(n_blocks):
            layers.append(block(self.in_channels, self.in_channels))
        if stride > 1:
            layers.append(Conv3x3Block(self.in_channels, out_channels, padding=0, bias_term=True))
            self.in_channels = out_channels
        return nn.Sequential(*layers)
def forward(self, x):
out = self.conv0a(x)
out = self.conv0b(out)
out = F.max_pool2d(out, 2)
out = self.layer1(out)
out = F.max_pool2d(out, 2)
out = self.layer2(out)
        out = out.view(out.size(0), -1)  # flatten to (batch, 2304); see the shape trace below
out = self.fc5(out)
return out
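# Shape trace for a 1x28x28 MNIST input, to show where fc5's 2304 comes from
# (each padding=0 3x3 conv shrinks each spatial dim by 2):
#   conv0a: 28 -> 26, conv0b: 26 -> 24, max_pool2d: 24 -> 12
#   layer1 transition conv: 12 -> 10, max_pool2d: 10 -> 5
#   layer2 transition conv: 5 -> 3   =>   256 channels * 3 * 3 = 2304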
class LossBlock(nn.Module):
def __init__(self, feat_size):
super(LossBlock, self).__init__()
        n_class = 10  # MNIST has 10 classes
self.loss_value = {}
self.softmax = SoftmaxLoss(feat_size, n_class)
def forward(self, x, y):
self.loss_value['softmax'] = self.softmax(x, y)
loss = self.loss_value['softmax']
return loss
class SoftmaxLoss(nn.Module):
def __init__(self, input_size, output_size, loss_weight=1.0):
super(SoftmaxLoss, self).__init__()
self.fc = nn.Linear(input_size, output_size, bias=False)
self.loss_weight = loss_weight
def forward(self, x, y):
        x = F.log_softmax(self.fc(x), dim=1)  # dim must be explicit in recent PyTorch
        self.prob = x
        return F.nll_loss(x, y) * self.loss_weight
def mnist_loader(batch_size, cuda=True):
kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
train_loader = torch.utils.data.DataLoader(
datasets.MNIST('mnist_example/data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('mnist_example/data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True, **kwargs)
return train_loader, test_loader
# load data
train_loader, val_loader = mnist_loader(args.batch_size, cuda=args.cuda)
net_solver = TrainTestNet(FaceResNet([1, 1, 1, 1], args.feat_size), train_loader, val_loader,
                          cuda=args.cuda, n_gpu=args.n_gpu)  # only n_blocks[0] and n_blocks[1] are used
print(net_solver.model)
for epoch in range(1, args.epochs + 1):
    net_solver.train_step(epoch, log_interval=args.log_interval)  # use the CLI flag instead of a hard-coded 10
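In case it helps to reproduce: I run the script like this (the filename is just what I saved it as, adjust to yours):

python face_resnet_mnist.py --batch-size 256 --epochs 30 --log-interval 10

With the hook registered correctly, you should see one gradient-norm line per batch printed in between the periodic loss lines.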