I try to implement a rather simple siamese network and a contrastive loss function. I use a pre-trained VGG16 as a backbone model and strip away the last ReLU and MaxPooling from the encoder. Then I add an adaptive pooling and a plain linear layer to generate the embedding vector.
To test my implementation, I pass random inputs and check if every parameter gets an update.
Problem: As one can see in the output of my MWE, elements 25 and 27 of the parameter list don't receive updates. I think these are the biases of the last convolution layer and of the linear layer. I also checked the contents of `optimizer.param_groups[0]["params"][25].grad` and `optimizer.param_groups[0]["params"][27].grad`. The gradients are all zero… Why is that?
Additional: If one input is bigger than 224 by 224, for instance `input_1 = torch.randn(4, 3, 400, 224)`, the bias of the last convolutional layer strangely gets updated.
MWE using PyTorch 1.11.0:
import torch
import torchvision.models as models
import torch.nn.functional as F
class Siamese_VGG16(torch.nn.Module):
    """Siamese embedding network built on a truncated, pre-trained VGG16.

    Both inputs are pushed through the *same* weights: the VGG16 feature
    extractor with its final two layers removed, a 7x7 adaptive average
    pooling, and a single linear layer that produces the embedding vector.

    Args:
        num_elements_embedding_vector: Length of the embedding produced for
            each input sample.
    """

    def __init__(self, num_elements_embedding_vector: int) -> None:
        super().__init__()
        vgg = models.vgg16(pretrained=True)
        # Keep every feature layer except the last two (the final ReLU and
        # max-pooling, per the accompanying description).
        kept_layers = list(vgg.features.children())[:-2]

        # A bare Module is used purely as a named container. Registration
        # order (encoder, pool, embedding) determines the parameter order.
        self.model = torch.nn.Module()
        self.model.add_module("encoder", torch.nn.Sequential(*kept_layers))
        self.model.add_module("pool", torch.nn.AdaptiveAvgPool2d((7, 7)))
        # 25088 input features; presumably 512 channels * 7 * 7 after pooling
        # -- confirm against the encoder's channel count.
        self.model.add_module(
            "embedding",
            torch.nn.Sequential(
                torch.nn.Linear(25088, num_elements_embedding_vector),
            ),
        )

    def forward_once(self, x: torch.Tensor) -> torch.Tensor:
        """Embed one batch: encode, pool to 7x7, flatten per sample, project."""
        pooled = self.model.pool(self.model.encoder(x))
        # Collapse everything but the batch dimension into one feature axis.
        flattened = pooled.reshape(pooled.shape[0], -1)
        return self.model.embedding(flattened)

    def forward(self, input1: torch.Tensor, input2: torch.Tensor):
        """Return the pair of embeddings for ``input1`` and ``input2``."""
        first_embedding = self.forward_once(input1)
        second_embedding = self.forward_once(input2)
        return first_embedding, second_embedding
def contrastive_loss(
    embedding_vec_1: torch.Tensor,
    embedding_vec_2: torch.Tensor,
    label: torch.Tensor,
    *,
    negative_margin: float = 1.0,
) -> torch.Tensor:
    """Return the mean contrastive loss over a batch of embedding pairs.

    For each pair with Euclidean distance ``d``:

    * ``label == 0``: the pair is pulled together, contributing ``d ** 2``.
    * ``label == 1``: the pair is pushed apart until it is at least
      ``negative_margin`` away, contributing
      ``max(negative_margin - d, 0) ** 2``.

    Note:
        The loss sees the embeddings only through ``F.pairwise_distance``,
        i.e. through their difference. Any term that is added identically to
        both embeddings (such as a bias shared by both branches of a siamese
        network) therefore cancels out and receives a zero gradient.

    Args:
        embedding_vec_1: First batch of embeddings, one row per sample.
        embedding_vec_2: Second batch of embeddings, same shape as the first.
        label: 1-D tensor with one entry per pair (it is unsqueezed to a
            column to line up with the per-pair distances).
        negative_margin: Distance beyond which a ``label == 1`` pair no
            longer contributes to the loss. Defaults to ``1.0``, the value
            that used to be hard-coded.

    Returns:
        Scalar tensor holding the batch-averaged loss.
    """
    # keepdim=True yields one distance per row as a column vector.
    euclidean_distance = F.pairwise_distance(
        embedding_vec_1, embedding_vec_2, keepdim=True
    )
    label_column = label.unsqueeze(1)

    pull_term = (1 - label_column) * torch.pow(euclidean_distance, 2)
    # Pairs already further apart than the margin are clamped to zero.
    margin_shortfall = torch.clamp(negative_margin - euclidean_distance, min=0.0)
    push_term = label_column * torch.pow(margin_shortfall, 2)

    return torch.mean(pull_term + push_term)
# Smoke test: run a single optimisation step on random data and report, for
# every parameter tensor, whether the step changed it.
model = Siamese_VGG16(128)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0005)
loss_func = contrastive_loss

# Snapshot of all parameters before the update.
parameters_pre = [
    param.detach().clone() for param in optimizer.param_groups[0]["params"]
]

# Random image-shaped batches and alternating pair labels.
input_1 = torch.randn(4, 3, 224, 224)
input_2 = torch.randn(4, 3, 224, 224)
label = torch.tensor([1, 0, 1, 0], dtype=torch.long)

# Clear gradients, then forward pass, backward pass, and parameter update.
optimizer.zero_grad()
output_1, output_2 = model(input_1, input_2)
loss = loss_func(output_1, output_2, label)
loss.backward()
optimizer.step()

# Snapshot of all parameters after the update.
parameters_post = [
    param.detach().clone() for param in optimizer.param_groups[0]["params"]
]

# One line per parameter tensor, indexed in optimiser order.
for idx, (t_pre, t_post) in enumerate(zip(parameters_pre, parameters_post)):
    print(f"{idx} : Equal" if torch.equal(t_pre, t_post) else f"{idx} : Different")