I am trying to train a neural network using the first 5 layers of VGG16 as a feature extractor with Imagenet pre-trained weights. These layers contain 5 Max pooling layers which reduce images to 1/32th of original image size. For images with size 640x360, I have activation maps of 20x11 and 512 channels.
I want to classify each activation between 20 classes, taking as input the 512 channels by creating a 1x1 convolution created like this:
# Model definition
class BlockWiseCSRNet(nn.Module):
"""
Implementation of paper Counting objects by clockwise classification
Implementation of a method with CRSNet backend
"""
def __init__(self, load_weights=False, count_levels=10):
super(BlockWiseCSRNet, self).__init__()
self.seen = 0
self.backend_feat = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M']
self.backend = make_layers(self.backend_feat)
self.count_levels = count_levels
self.classification = nn.Sequential(
nn.Conv2d(in_channels=512, out_channels=count_levels, kernel_size=1),
nn.Softmax(dim=1)
)
if not load_weights:
mod = models.vgg16(pretrained=True)
self._initialize_weights()
# copy weights from VGG pretrained to frontend
for k, v in self.backend.state_dict().items():
if 'weight' in k:
self.backend.state_dict()[k].data[:] = mod.state_dict()['features.' + k][1].data[:]
else:
self.backend.state_dict()[k].data = mod.state_dict()['features.' + k][1].data
def freeze_backend(self):
"""
Freeze backend parameters to make it not trainable
:return:
"""
for p in self.backend.parameters():
p.requires_grad = False
def unfreeze_classification(self):
for p in self.classification.parameters():
p.requires_grad = True
def forward(self, x):
x = self.backend(x)
x = self.classification(x)
return x
def _initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.normal_(m.weight, std=0.01)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
def make_layers(cfg, in_channels=3, batch_norm=False, dilation=False):
if dilation:
d_rate = 2
else:
d_rate = 1
layers = []
for v in cfg:
if v == 'M':
layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
else:
conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=d_rate, dilation=d_rate)
if batch_norm:
layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
else:
layers += [conv2d, nn.ReLU(inplace=True)]
in_channels = v
return nn.Sequential(*layers)
### Relevant train loop
model = BlockWiseCSRNet(count_levels=20)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad, model.parameters()), args.learning_rate)
# train loop step
def train_step(epoch, device, data_loader, model, criterion, optimizer):
print('train step!')
running_loss = 0.0
# set model to train mode
model.train()
# only train classification layers, keep weights of backend
model.freeze_backend()
model.unfreeze_classification()
for mini_batch, (batch, labels, filenames) in enumerate(data_loader):
# Transfer to GPU
batch = batch.to(device)
labels = labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# process batch
outputs = model(batch)
loss = criterion(outputs, labels)
a = list(model.classification.parameters())[0].clone()
loss.backward()
optimizer.step()
b = list(model.classification.parameters())[0].clone()
print('weights updated? {}'.format(not torch.equal(a.data, b.data)))
#print(torch.sum(model.classification[0].weight.sum()))
# print statistics every print_stats_steps mini-batches
running_loss += loss.item()
print('[{}] total train loss: {}'.format(epoch, running_loss))
This solution is able to process images and give output activations with proper sizes.
However, I am not able to train the network. I am using CrossEntropyLoss and Adam optimizer but output maps are always constant (even after multiple training iterations)
# batch is an image
batch.shape -> torch.Size([1, 3, 360, 640])
output = model(batch)
output.shape -> torch.Size([1, 20, 11, 20])
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), args.learning_rate)
_, predicted_class_map = torch.max(output, dim=1)
# predicted class map is always 0
I have tried several learning rates and configurations, but the predicted class map is always 0. I debugged weight updating and updates are always 0 (before and after optimizer step weights are equal)
What I am doing wrong? I am not sure if the problem is at the architecture or in the training loop (which I took from PyTorch tutorials)
EDIT: After inspecting the content of net output, I think that the problem is that all channels in output have the same values for all image. So:
all_means = [ outputs[0, t, :, :].mean().item() for t in range(0, outputs.shape[1])]
has a vector with all positions equal.
Thanks