I am working on my image captioning pipeline. For evaluation I am using the pycocoevalcap package (GitHub - salaniz/pycocoevalcap: Python 3 support for the MS COCO caption evaluation tools), and I am writing some tests for the CIDEr metric. If there is only a single ground-truth/prediction pair, CIDEr gives 0 even when the two captions are exactly the same. With two or more pairs it is no longer 0. Why is that? I use the code below to test.
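A stripped-down, CIDEr-only version of the same check reproduces it (this sketch assumes the pip-installed package layout, where the modules sit under pycocoevalcap; my full test script further below imports them from a checkout of the repo instead):

# Single-pair check: ground truth and prediction are identical,
# yet CIDEr still comes back as 0.
from pycocoevalcap.cider.cider import Cider

gts = {'1': ['go down the stairs all the way and stop at the bottom']}
res = {'1': ['go down the stairs all the way and stop at the bottom']}

score, per_image = Cider().compute_score(gts, res)
print(score)  # 0.0 when there is only one image id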
'''
Needs Java 1.8 (for the METEOR / SPICE scorers).
Package:
    pycocoevalcap
Note: the imports below assume this script lives inside the pycocoevalcap
source tree; with the pip-installed package they would be
from pycocoevalcap.bleu.bleu import Bleu, etc.
'''
from bleu.bleu import Bleu
from meteor.meteor import Meteor
from rouge.rouge import Rouge
from cider.cider import Cider
from spice.spice import Spice
import ssl

class Scorer():
    def __init__(self, ref, gt):
        # ref: ground-truth captions, gt: generated captions,
        # both as {image_id: [caption, ...]} dicts.
        self.ref = ref
        self.gt = gt
        print('setting up scorers...')
        self.word_based_scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr"),
            # (Spice(), "SPICE"),
        ]
    def compute_scores(self):
        # Score all image ids in a single compute_score call per metric.
        total_scores = {
            "Bleu1": [],
            "Bleu2": [],
            "Bleu3": [],
            "Bleu4": [],
            "METEOR": [],
            "ROUGE_L": [],
            "CIDEr": [],
            "SPICE": []
        }
        for scorer, method in self.word_based_scorers:
            # compute_score takes (ground truth, predictions) and returns
            # (corpus-level score, per-image scores).
            score, scores = scorer.compute_score(self.ref, self.gt)
            if isinstance(method, list):
                # Bleu returns one corpus-level score per n-gram order.
                total_scores["Bleu1"].append(score[0])
                total_scores["Bleu2"].append(score[1])
                total_scores["Bleu3"].append(score[2])
                total_scores["Bleu4"].append(score[3])
            else:
                total_scores[method].append(score)
        return total_scores
    def compute_scores_iterative(self):
        # Score each image id on its own, one (ref, gt) pair at a time.
        total_scores = {
            "Bleu1": [],
            "Bleu2": [],
            "Bleu3": [],
            "Bleu4": [],
            "METEOR": [],
            "ROUGE_L": [],
            "CIDEr": [],
            "SPICE": []
        }
        for key in self.ref:
            curr_ref = {key: self.ref[key]}
            curr_gt = {key: self.gt[key]}
            for scorer, method in self.word_based_scorers:
                score, _ = scorer.compute_score(curr_ref, curr_gt)
                if isinstance(method, list):
                    total_scores["Bleu1"].append(score[0])
                    total_scores["Bleu2"].append(score[1])
                    total_scores["Bleu3"].append(score[2])
                    total_scores["Bleu4"].append(score[3])
                else:
                    total_scores[method].append(score)
        return total_scores

if __name__ == '__main__':
    # Work around SSL certificate errors.
    ssl._create_default_https_context = ssl._create_unverified_context

    # A single ground-truth/prediction pair with identical captions:
    # the CIDEr entry in the returned dict comes out as 0.
    ref = {'1': ['go down the stairs all the way and stop at the bottom']}
    gen = {'1': ['go down the stairs all the way and stop at the bottom']}
    scorer = Scorer(ref, gen)
    tol_scores = scorer.compute_scores()
    print(tol_scores)
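For comparison, this is the two-or-more-pair version of the test, where CIDEr is no longer 0 even though each prediction still matches its ground truth exactly (the caption for key '2' is just an arbitrary placeholder I made up):

    # Two-pair check: each prediction is identical to its ground truth,
    # but now CIDEr is not 0 anymore.
    ref2 = {
        '1': ['go down the stairs all the way and stop at the bottom'],
        '2': ['walk past the table and wait next to the open door'],  # placeholder caption
    }
    gen2 = {
        '1': ['go down the stairs all the way and stop at the bottom'],
        '2': ['walk past the table and wait next to the open door'],
    }
    scorer2 = Scorer(ref2, gen2)
    print(scorer2.compute_scores())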