Hi, I want to distill a pretrained ResNet model in such a way that embeddings that are close in the teacher model are also close in the student model. Afterwards, I evaluate the student by computing the SRCC (Spearman rank correlation coefficient) between the teacher's and the student's outputs after softmax on a certain dataset. However, my SRCC is around 0, and I am not sure what I am doing wrong here.
Here is the training loop.
# --- Distillation setup ----------------------------------------------------
# Teacher: the visual tower of an already-loaded CLIP model (`clip_model` is
# defined elsewhere). Student: a smaller ModifiedResNet.
# NOTE(review): output_dim=1024 presumably matches the teacher's embedding
# width so both outputs can be compared directly -- confirm for the CLIP
# variant in use.
teacher_clip = clip_model.visual
student_clip = ModifiedResNet(
    layers=(2, 2, 2, 2),
    output_dim=1024,
    heads=32,
    input_resolution=224,
    width=32,
)
teacher_clip = teacher_clip.to("cuda")
student_clip = student_clip.to("cuda")

# Training hyper-parameters.
max_epochs = 300
ep_log_interval = 1  # print the accumulated loss every N epochs
lrn_rate = 0.005

# With a target of +1 this loss is 1 - cos(student, teacher), averaged over
# the batch.
cel_loss = nn.CosineEmbeddingLoss(reduction="mean")
# Only the student's parameters are handed to the optimizer.
optimizer = torch.optim.SGD(student_clip.parameters(), lr=lrn_rate)

# Freeze the teacher: eval mode plus no gradient tracking on any of its
# parameters (equivalent to looping over parameters() and clearing
# requires_grad on each one).
teacher_clip.eval()
teacher_clip.requires_grad_(False)
# --- Distillation training loop ---------------------------------------------
# The student is trained so that its embedding of each image points in the
# same direction as the frozen teacher's embedding of that image.
#
# NOTE(review): the cosine loss constrains only the direction of the
# embeddings, not their norm. If evaluation applies softmax to the raw
# outputs, a scale mismatch between teacher and student may matter -- confirm
# the evaluation compares like with like.

# autocast runs parts of the forward pass in float16; a GradScaler scales the
# loss before backward so small gradients do not underflow to zero.
# (On recent PyTorch releases the equivalent spelling is
# torch.amp.GradScaler("cuda").)
scaler = torch.cuda.amp.GradScaler()

# Explicit for clarity; the teacher is expected to stay in eval mode.
student_clip.train()

for epoch in range(max_epochs):
    epoch_loss = 0.0  # sum of per-batch mean losses for this epoch
    for batch in loader:
        X = batch[0].to("cuda")  # first element of the batch is the input
        optimizer.zero_grad()

        with torch.autocast("cuda"):
            # The teacher only provides targets; skip autograd bookkeeping.
            with torch.no_grad():
                Y = teacher_clip(X)
            oupt = student_clip(X)
            # Target of +1 for every pair: "these two should be similar".
            # Created on the teacher output's device instead of a
            # hard-coded "cuda:0".
            y = torch.ones(Y.shape[0], device=Y.device)
            loss_val = cel_loss(oupt, Y, y)

        # Backward and optimizer step happen outside the autocast region.
        scaler.scale(loss_val).backward()
        scaler.step(optimizer)  # skipped internally if grads contain inf/NaN
        scaler.update()

        epoch_loss += loss_val.item()

    if epoch % ep_log_interval == 0:
        print("epoch = %4d loss = %0.4f" % (epoch, epoch_loss))

# Persist the distilled student's weights once training has finished.
torch.save(student_clip.state_dict(), "./kd_model_1.pt")