Hello dear friends,
I have an image-captioning model that I have already trained with cross-entropy loss, and now I want to fine-tune it with a self-critical loss.
The problem is that even though the loss is computed and gradients are produced, the model weights are not updated. I am posting my code below, along with the output it prints, so that anyone who recognizes the problem can help me.
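For reference, the objective I am trying to implement is the usual self-critical (SCST) loss, with the CIDEr score as the reward r and the greedy caption as the baseline; this is what the code below is meant to compute:

$$L(\theta) = -\big(r(c^{\mathrm{beam}}) - r(c^{\mathrm{greedy}})\big)\,\log p_\theta(c^{\mathrm{beam}})$$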
In this code I print the weights of the last fully connected layer (fc weights) and the gradients (grads) so that you can see what I mean.
Many thanks
cider_scorer = Cider()
batch_size = 50
best_cider = 0.0
ref_dict = {}
beam_dict = {}
score_dict = {}
greedy_dict = {}
loss_fine = AverageMeter()

for i in range(len(train_captions)):
    image_id = train_captions[i][0]
    ref_dict[image_id] = train_captions[i][1:]
    image = '/content/val2014/COCO_val2014_' + '%012d.jpg' % (image_id)

    # Beam-search caption with gradients enabled
    with torch.enable_grad():
        model.train()
        optimizer.zero_grad()
        best_candidates, log_prob = model.decoder.beam_search_fine_tune(image, 20, 5)
        beam_dict[image_id] = [best_candidates]
        score_dict[image_id] = log_prob

    # Greedy caption as the baseline, without gradients
    with torch.no_grad():
        model.eval()
        greedy_candidates = model.decoder.generate_caption(image)
        greedy_dict[image_id] = [greedy_candidates]

    if i % batch_size == 0:
        # CIDEr rewards for the beam-search and greedy captions
        cider_mean_beam, cider_scores_beam = cider_scorer.compute_score(ref_dict, beam_dict)
        cider_scores_beam = torch.tensor(cider_scores_beam, requires_grad=True)
        cider_mean_greedy, cider_scores_greedy = cider_scorer.compute_score(ref_dict, greedy_dict)
        cider_scores_greedy = torch.tensor(cider_scores_greedy, requires_grad=False)
        log_probs = torch.tensor(list(score_dict.values()), requires_grad=False, dtype=torch.float64)

        # Self-critical loss: -(log p) * (reward_beam - reward_greedy)
        loss = -log_probs * (cider_scores_beam - cider_scores_greedy)
        loss = loss.mean()
        loss.backward()
        print('grads:', cider_scores_beam.grad)
        optimizer.step()
        print('fc weights:', model.fc.weight)

        loss_fine.update(loss.item())
        print(" Step:{} loss: {:.5f}".format(i, loss_fine.avg))

        # Reset the per-batch dictionaries
        ref_dict = {}
        beam_dict = {}
        score_dict = {}
        greedy_dict = {}
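In case it helps, this is the kind of check I run to see whether a step actually changes the weights (a minimal, self-contained sketch with a toy nn.Linear and Adam standing in for my model and optimizer; in the real code I snapshot model.fc.weight before optimizer.step() and compare it afterwards with torch.equal):

import torch
import torch.nn as nn

# Toy stand-in just to illustrate the check; the real check uses model.fc.weight and my optimizer.
toy = nn.Linear(4, 3)
opt = torch.optim.Adam(toy.parameters(), lr=1e-3)

x = torch.randn(2, 4)
loss = toy(x).sum()

before = toy.weight.detach().clone()   # snapshot of the weights before the update
opt.zero_grad()
loss.backward()
print('weight grad norm:', toy.weight.grad.norm().item())   # gradient on the actual parameter
opt.step()
print('weights changed:', not torch.equal(before, toy.weight.detach()))

As the results below show, the printed fc weights of my model stay exactly the same from step to step.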
RESULTS:
grads: tensor([7.2035], dtype=torch.float64)
fc weights: Parameter containing:
tensor([[-0.0658, -0.0054, 0.0393, …, -0.0193, -0.0362, 0.0145],
        [-0.0407, -0.0050, 0.0332, …, -0.0232, -0.0370, 0.0101],
        [ 0.0271, 0.0122, -0.0103, …, 0.0303, 0.0279, -0.0095],
        …,
        [-0.0102, -0.0493, 0.0631, …, 0.0041, -0.0058, 0.0167],
        [-0.0391, 0.0197, 0.0330, …, -0.0742, -0.0536, 0.0059],
        [-0.0553, 0.0032, 0.0076, …, -0.0251, -0.0757, 0.0284]],
       device='cuda:0', requires_grad=True)
Step:0 loss: 0.00000
grads: tensor([0.0987, 0.1617, 0.1408, 0.1553, 0.1510, 0.1693, 0.1722, 0.1452, 0.1334,
0.1598, 0.2074, 0.1429, 0.1497, 0.1009, 0.1359, 0.1432, 0.1511, 0.1559,
0.1161, 0.2133, 0.1689, 0.1176, 0.1463, 0.1935, 0.1553, 0.1334, 0.1636,
0.2197, 0.1411, 0.1761, 0.1962, 0.1250, 0.1676, 0.1787, 0.1486, 0.1145,
0.1514, 0.1565, 0.1688, 0.1365, 0.1928, 0.1920, 0.1523, 0.2411, 0.1501,
0.1378, 0.1699, 0.1207, 0.1291, 0.1639], dtype=torch.float64)
fc weights: Parameter containing:
tensor([[-0.0658, -0.0054, 0.0393, …, -0.0193, -0.0362, 0.0145],
        [-0.0407, -0.0050, 0.0332, …, -0.0232, -0.0370, 0.0101],
        [ 0.0271, 0.0122, -0.0103, …, 0.0303, 0.0279, -0.0095],
        …,
        [-0.0102, -0.0493, 0.0631, …, 0.0041, -0.0058, 0.0167],
        [-0.0391, 0.0197, 0.0330, …, -0.0742, -0.0536, 0.0059],
        [-0.0553, 0.0032, 0.0076, …, -0.0251, -0.0757, 0.0284]],
       device='cuda:0', requires_grad=True)
Step:50 loss: 0.39694
grads: tensor([0.1346, 0.1545, 0.1668, 0.1208, 0.1706, 0.1593, 0.1656, 0.1346, 0.1743,
0.1377, 0.1482, 0.1266, 0.1135, 0.1516, 0.1038, 0.2127, 0.1568, 0.1153,
0.1474, 0.1539, 0.1680, 0.0846, 0.1350, 0.1399, 0.1596, 0.1388, 0.1357,
0.1696, 0.1463, 0.1823, 0.1963, 0.2008, 0.1386, 0.1716, 0.1793, 0.1568,
0.1421, 0.1152, 0.1393, 0.1493, 0.2451, 0.1245, 0.1327, 0.2142, 0.1903,
0.1272, 0.1970, 0.1610, 0.1497, 0.1151], dtype=torch.float64)
fc weights: Parameter containing:
tensor([[-0.0658, -0.0054, 0.0393, …, -0.0193, -0.0362, 0.0145],
        [-0.0407, -0.0050, 0.0332, …, -0.0232, -0.0370, 0.0101],
        [ 0.0271, 0.0122, -0.0103, …, 0.0303, 0.0279, -0.0095],
        …,
        [-0.0102, -0.0493, 0.0631, …, 0.0041, -0.0058, 0.0167],
        [-0.0391, 0.0197, 0.0330, …, -0.0742, -0.0536, 0.0059],
        [-0.0553, 0.0032, 0.0076, …, -0.0251, -0.0757, 0.0284]],
       device='cuda:0', requires_grad=True)
Step:100 loss: 0.41882
grads: tensor([0.1674, 0.1708, 0.1115, 0.2100, 0.1815, 0.0907, 0.1855, 0.1660, 0.1070,
0.1508, 0.1388, 0.1935, 0.1250, 0.1475, 0.1303, 0.1371, 0.1636, 0.1295,
0.1458, 0.1122, 0.1448, 0.0852, 0.1405, 0.1677, 0.1202, 0.1023, 0.1091,
0.1314, 0.1440, 0.1685, 0.1826, 0.1709, 0.1684, 0.1191, 0.1917, 0.1619,
0.1429, 0.1311, 0.1433, 0.1142, 0.1843, 0.1243, 0.1682, 0.1173, 0.1360,
0.1643, 0.1051, 0.1427, 0.1548, 0.1506], dtype=torch.float64)
fc weights: Parameter containing:
tensor([[-0.0658, -0.0054, 0.0393, …, -0.0193, -0.0362, 0.0145],
        [-0.0407, -0.0050, 0.0332, …, -0.0232, -0.0370, 0.0101],
        [ 0.0271, 0.0122, -0.0103, …, 0.0303, 0.0279, -0.0095],
        …,
        [-0.0102, -0.0493, 0.0631, …, 0.0041, -0.0058, 0.0167],
        [-0.0391, 0.0197, 0.0330, …, -0.0742, -0.0536, 0.0059],
        [-0.0553, 0.0032, 0.0076, …, -0.0251, -0.0757, 0.0284]],
       device='cuda:0', requires_grad=True)
Step:150 loss: 0.31828
grads: tensor([0.1179, 0.1831, 0.1659, 0.1355, 0.1057, 0.1428, 0.1791, 0.1210, 0.1852,
0.1498, 0.1711, 0.1554, 0.1491, 0.1810, 0.1842, 0.1201, 0.1507, 0.1520,
0.1629, 0.1359, 0.1011, 0.1495, 0.1202, 0.1867, 0.1987, 0.1398, 0.1562,
0.1334, 0.1707, 0.1497, 0.1385, 0.1236, 0.1601, 0.1235, 0.1372, 0.1524,
0.1820, 0.1358, 0.1286, 0.1749, 0.1247, 0.2020, 0.1420, 0.1191, 0.1706,
0.1452, 0.1736, 0.1884, 0.0901, 0.2085], dtype=torch.float64)
fc weights: Parameter containing:
tensor([[-0.0658, -0.0054, 0.0393, …, -0.0193, -0.0362, 0.0145],
        [-0.0407, -0.0050, 0.0332, …, -0.0232, -0.0370, 0.0101],
        [ 0.0271, 0.0122, -0.0103, …, 0.0303, 0.0279, -0.0095],
        …,
        [-0.0102, -0.0493, 0.0631, …, 0.0041, -0.0058, 0.0167],
        [-0.0391, 0.0197, 0.0330, …, -0.0742, -0.0536, 0.0059],
        [-0.0553, 0.0032, 0.0076, …, -0.0251, -0.0757, 0.0284]],
       device='cuda:0', requires_grad=True)
Step:200 loss: 0.34465