I am ensembling two models with mean pooling, but I also want to weight the loss of each separate model at the same time, so that the less accurate model contributes less to the final prediction.
Here is a simple example of what I am trying to achieve.
import torch
import torch.nn as nn
import torch.nn.functional as F

class WeightLoss(nn.Module):
    def __init__(self, n):
        super().__init__()
        self.n = n
        self.param = nn.Parameter(torch.empty(n), requires_grad=True)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.constant_(self.param, 1 / self.n)

    def forward(self, x):
        # normalize the params so the weights sum to 1
        return torch.mul(x, F.softmax(self.param, dim=0))
class Net2(nn.Module):
    def __init__(self, modelA, modelB):
        super(Net2, self).__init__()
        self.modelA = modelA
        self.modelB = modelB
        self.fc1 = nn.Linear(10, 8)
        self.fc2 = nn.Linear(10, 8)
        self.weight_loss = WeightLoss(2)

    def loss(self, x, y):
        return F.cross_entropy(x, y)

    def forward(self, x, y):
        # individual outputs
        output1 = self.fc1(self.modelA(x))
        output2 = self.fc2(self.modelB(x))
        # mean pooling
        final_output = torch.mean(torch.stack([output1, output2]), dim=0)
        final_loss = self.loss(final_output, y)
        # weighted loss for individual models
        loss1 = self.loss(output1, y)
        loss2 = self.loss(output2, y)
        weighted_loss = self.weight_loss(torch.stack([loss1, loss2]))
        weighted_loss = torch.sum(weighted_loss)
        # combine the 2 losses
        final_loss = torch.add(final_loss, weighted_loss)
        return final_loss
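For reference, here is roughly how I am training it; the optimizer, learning rate, and data-loader names below are just placeholders, not my exact setup:

net = Net2(modelA, modelB)
# net.parameters() includes weight_loss.param, so it should receive gradients too
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

for epoch in range(num_epochs):
    for x, y in train_loader:
        optimizer.zero_grad()
        loss = net(x, y)  # forward() already returns the combined loss
        loss.backward()
        optimizer.step()
    # check the normalized ensemble weights after each epoch
    print(F.softmax(net.weight_loss.param, dim=0))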
With this architecture, final_loss does decrease over time, but the weights for weighing the losses of each model either do not change at all or change only by a very small margin.
Have I done anything wrong here?