Hi @ptrblck
I’m using pre-trained Places365-resnet50 as a base model and added a new fc layer. Only the newly added fc layer is trained to classify sun attributes. So in one pass I can predict both places 365 categories and sun attributes.
Here is my model:
# the architecture to use
arch = 'resnet50'
# load the pre-trained weights
model_file = '%s_places365.pth.tar' % arch
if not os.access(model_file, os.W_OK):
weight_url = 'http://places2.csail.mit.edu/models_places365/' + '%s_places365.pth.tar' % arch
os.system('wget ' + weight_url)
model = models.__dict__[arch](num_classes=365)
checkpoint = torch.load(model_file, map_location=lambda storage, loc: storage)
state_dict = {str.replace(k, 'module.', ''): v for k, v in checkpoint['state_dict'].items()}
model.load_state_dict(state_dict)
class CustomizedResNet(nn.Module):
def __init__(self):
super(CustomizedResNet, self).__init__()
# Resnet 50 as base model
self.base_model = model
def hook_feature(module, input, output):
self.feature = output
self.base_model._modules.get('avgpool').register_forward_hook(hook_feature)
self.scene_attr_fc = nn.Linear(2048, 102)
# freeze weights
for param in self.base_model.parameters():
param.requires_grad = False
for param in self.scene_attr_fc.parameters():
param.requires_grad = True
def forward(self, x):
places365_output = self.base_model(x)
# compute scene attributes
# feed the outputs from avgpool to the new fc layer
attributes_output = self.feature.view(self.feature.size(0), -1)
attributes_output = self.scene_attr_fc(attributes_output)
return places365_output, attributes_output
customized_model = CustomizedResNet()
And I only pass the parameters of the new fc layer to the optimizer.
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = optim.SGD(customized_model.scene_attr_fc.parameters(), lr=learning_rate)
My training parts look like this:
torch.save(customized_model.state_dict(), 'before_training.pth')
for epoch in range(num_epochs):
customized_model.train()
for i, (inputs, labels) in enumerate(dataloader):
inputs = inputs.to(device)
labels = labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
with torch.set_grad_enabled(True):
_, scene_attr_outputs = customized_model(inputs)
loss = criterion(scene_attr_outputs, labels)
loss.backward()
optimizer.step()
torch.save(customized_model.state_dict(), 'model_saved_at_epoch_%s.pth' % epoch)
And my testing parts look like this:
data_transforms = {
'test': transforms.Compose([
transforms.Resize((224,224)),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
}
customized_model.load_state_dict(torch.load('model_saved_at_epoch_9.pth'))
customized_model.eval()
images_test = ['stone.jpg']
for img_path in images_test:
# load test image
img = Image.open(img_path).convert('RGB')
img = data_transforms['test'](img)
img = img.to(device)
# prediction
places365_outputs, scene_attr_outputs = customized_model.forward(img.unsqueeze(0))
# prediction for places365
# print(places365_outputs.shape) -> torch.Size([1, 365])
h_x = F.softmax(places365_outputs, 1).data.squeeze()
probs, idx = h_x.sort(0, True)
print('places 365 prediction on {}'.format(img_path))
for i in range(0, 5):
# classes stores all 365 labels
print('{:.3f} -> {}'.format(probs[i], classes[idx[i]]))
The problem is, if I use the models saved at different epoch to predict the same image, the prediction of places 365 is changing even if I already freeze all weights for places 365 branch.
For example,
If I use the model saved before any training happens, its prediction is
places 365 prediction on stone.jpg
0.298 -> coast
0.291 -> ocean
0.172 -> beach
0.067 -> ice_floe
0.051 -> sky
If I use model_saved_at_epoch_3.pth, it gives
places 365 prediction on stone.jpg
0.308 -> coast
0.259 -> ocean
0.132 -> beach
0.107 -> sky
0.041 -> cliff
model_saved_at_epoch_13.pth gives:
places 365 prediction on stone.jpg
0.295 -> coast
0.234 -> ocean
0.152 -> sky
0.111 -> beach
0.047 -> cliff
I even compared the weights of the base_model after every epoch to the original weights, and it looks like the weights didn’t change:
# before training occurs
original_weights = []
for name, param in customized_model.base_model.named_parameters():
original_weights.append(param.clone())
for epoch in range(num_epochs):
# training .....
max_abs_diff_sum = 0
idx = 0
for epoch_name, epoch_param in customized_model.base_model.named_parameters():
max_abs_diff_sum += (original_weights[idx] - epoch_param).abs().max()
idx += 1
print(max_abs_diff_sum) # all print tensor(0., device='cuda:0'). So I think the weights of base_model didnt change
Do you have any idea on why the probability distribution is changing even if I froze all weights for the places 365 branch (and it also looks like the base_model weights are the same)?
Thank you.