Having seen a paper that backpropagates only the top 70% of gradients, I am wondering whether this strategy can really improve performance. Some people call this Online Hard Example Mining (OHEM).
Attached below is my custom cross-entropy implementation, which keeps only the top-k fraction of per-example losses for binary classification. I have tested it with top_k = 100%, and the result matches the original nn.CrossEntropyLoss().
May I ask: is there a better way to achieve this? Do you think this is good practice?
Module
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable


class topk_crossEntrophy(nn.Module):

    def __init__(self, top_k=0.7):
        super(topk_crossEntrophy, self).__init__()
        self.loss = nn.NLLLoss()
        self.top_k = top_k
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, target):
        softmax_result = self.softmax(input)

        # Accumulate the per-example losses one row at a time.
        loss = Variable(torch.Tensor(1).zero_())
        for idx, row in enumerate(softmax_result):
            gt = target[idx]
            pred = torch.unsqueeze(row, 0)
            cost = self.loss(pred, gt)
            loss = torch.cat((loss, cost), 0)
        loss = loss[1:]  # drop the dummy zero used for initialization

        if self.top_k == 1:
            valid_loss = loss
        else:
            # Keep only the top_k fraction with the largest losses.
            index = torch.topk(loss, int(self.top_k * loss.size()[0]))
            valid_loss = loss[index[1]]

        return torch.mean(valid_loss)
Test
# Random logits for 10 examples and 2 classes
a = torch.randn((10, 2))

# Random binary targets as a LongTensor
b = np.random.randint(2, size=10)
b = torch.from_numpy(b.astype(np.int64))

topk_loss = topk_crossEntrophy()
loss = topk_loss(Variable(a, requires_grad=True), Variable(b))
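For comparison, here is a minimal sketch of how the same idea could be written without the per-row loop, using F.cross_entropy with reduction='none' to get per-example losses and torch.topk to keep the hardest ones. The function name topk_cross_entropy and the shapes are just assumptions for illustration, not part of the original code:

import torch
import torch.nn.functional as F

def topk_cross_entropy(input, target, top_k=0.7):
    # Per-example losses: shape (batch,) instead of a single averaged scalar
    losses = F.cross_entropy(input, target, reduction='none')
    if top_k == 1.0:
        return losses.mean()
    # Keep only the top_k fraction with the largest (hardest) losses
    k = int(top_k * losses.size(0))
    topk_losses, _ = torch.topk(losses, k)
    return topk_losses.mean()

# Usage on random data, analogous to the test above
a = torch.randn(10, 2, requires_grad=True)
b = torch.randint(2, (10,))
loss = topk_cross_entropy(a, b)
loss.backward()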