Custom Ensemble approach

Thank you. Could you please review the model architecture below?

As mentioned, the input data has two parts, images and labels, and there are 10 possible labels.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import inception_v3

class MyEnsemble(nn.Module):
    def __init__(self, modelA, modelB, nb_classes=10):
        super(MyEnsemble, self).__init__()
        self.modelA = modelA
        self.modelB = modelB
        # Remove the last linear layer of each backbone
        self.modelA.fc = nn.Identity()
        self.modelB.fc = nn.Identity()

        # Create one new classifier per output head
        self.classifier1 = nn.Linear(2048, nb_classes)
        self.classifier2 = nn.Linear(2048, nb_classes)

    def forward(self, x):
        x1 = self.modelA(x.clone())  # clone to make sure x is not changed by inplace methods
        x1 = x1.view(x1.size(0), -1)
        x2 = self.modelB(x)
        x2 = x2.view(x2.size(0), -1)
        x = torch.cat((x1, x2), dim=1)  # note: this concatenated tensor is never used below

        x1 = self.classifier1(F.relu(x1))
        x2 = self.classifier2(F.relu(x2))
        return x1, x2

# Train your separate models
# ...
# We use pretrained torchvision models here
modelA = inception_v3(pretrained=True, aux_logits=False)
modelB = inception_v3(pretrained=True, aux_logits=False)

# Freeze these models
for param in modelA.parameters():
    param.requires_grad_(False)

for param in modelB.parameters():
    param.requires_grad_(False)

# Create ensemble model
model = MyEnsemble(modelA, modelB)
x = torch.randn(10, 3, 299, 299)  # dummy batch; Inception v3 expects 299x299 inputs
output_v, output_c = model(x)
print(output_v.shape, output_c.shape)  # expect torch.Size([10, 10]) for each head

Also, I have doubts about whether the code below, which I developed in bits and pieces, is correct. Could you please advise if there are any possible errors?

import copy
import matplotlib.pyplot as plt

# Epoch: the number of times the learning algorithm works through the entire training dataset
loss_arr = []
loss_epoch_arr = []
max_epochs = 1
min_loss = 1000  # large initial value; replaced by the first batch loss
best_model = None

loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=learning_rate)  # learning_rate defined elsewhere

for epoch in range(max_epochs):

    for i, data in enumerate(train_loader, 0):
        # enumerate() returns a count (starting at 0) together with the values obtained from the iterable
        # get the inputs
        inputs, labels = data
        labels_v = labels[:, 0, :]  # vowel
        labels_c = labels[:, 1, :]  # consonant
        _, actual_v = torch.max(labels_v.data, 1)  # one-hot target -> class index
        _, actual_c = torch.max(labels_c.data, 1)
        opt.zero_grad()  # reset all parameter gradients so each batch updates correctly
        
        outputs_v, outputs_c = model(inputs)
        loss_v = loss_fn(outputs_v, actual_v)  # vowel loss
        loss_c = loss_fn(outputs_c, actual_c)  # consonant loss
        loss = loss_v + loss_c
        loss.backward()  # backward pass: computes gradients of the loss w.r.t. the parameters
        opt.step()  # the optimizer takes a step based on the parameter gradients
        
        if min_loss > loss.item():
            min_loss = loss.item()
            best_model = copy.deepcopy(model.state_dict()) #create a fully independent clone of the original object and all of its children.
        
        loss_arr.append(loss.item())
        
        del inputs, labels, outputs_v, outputs_c
        torch.cuda.empty_cache() ##to empty the unused memory after processing each batch
        
    loss_epoch_arr.append(loss.item())  # record the last batch loss of this epoch

    print('Epoch: %d/%d, Val acc: %0.2f, Train acc: %0.2f' % (
        epoch + 1, max_epochs, evaluation(validation_loader), evaluation(train_loader)))
    
model.load_state_dict(best_model)  # restore the best model
plt.plot(loss_epoch_arr)
plt.show()

You are not using the ensemble approach in the first code snippet, but just two separate models, both with a new classifier. Besides that, the code looks fine.

The second code snippet looks fine, too. I would remove the .data usage, as it might yield unwanted side effects, and also the torch.cuda.empty_cache() call, as it will slow down your code without giving any benefit if you are not sharing the GPU with another process.
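For reference, a fused version in the spirit of the follow-up posts would pass the concatenated features through a single classifier. A minimal sketch, assuming two Inception v3 backbones with 2048-dim features each (the class name is hypothetical):

class FusedEnsemble(nn.Module):
    def __init__(self, modelA, modelB, nb_classes=10):
        super().__init__()
        self.modelA = modelA
        self.modelB = modelB
        self.modelA.fc = nn.Identity()
        self.modelB.fc = nn.Identity()
        self.classifier = nn.Linear(2048 + 2048, nb_classes)

    def forward(self, x):
        x1 = self.modelA(x.clone()).flatten(1)
        x2 = self.modelB(x).flatten(1)
        x = torch.cat((x1, x2), dim=1)  # fuse both feature vectors
        return self.classifier(F.relu(x))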

@ptrblck, thank you. I am getting very low train/test accuracy and I am not sure why. I was wondering if the architecture itself is wrong, or if it is something else. This looks more like a multi-class, multi-label classification problem, as every output has a predicted vowel and a predicted consonant, and there are 10 vowels and 10 consonants in my input data set. Do you think the above code is still fine, or should I change the approach?

I don’t think you are looking for an ensemble approach and would recommend starting with separate models first and making sure they work properly.
A good starting point is to try to overfit a small dataset, e.g. just 10 samples, by playing around with the hyperparameters and the model architecture, and to make sure the model can overfit this dataset.
Once this is done, you can scale up the problem by using more data.
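A minimal sketch of this sanity check, assuming a train_dataset object and reusing the two-head model, opt, and loss_fn names from the earlier snippet:

from torch.utils.data import Subset, DataLoader

# train on just 10 samples and check that the loss approaches zero
tiny_loader = DataLoader(Subset(train_dataset, range(10)), batch_size=10, shuffle=True)

for epoch in range(200):
    for inputs, labels in tiny_loader:
        opt.zero_grad()
        outputs_v, outputs_c = model(inputs)
        loss = loss_fn(outputs_v, labels[:, 0, :].argmax(1)) + \
               loss_fn(outputs_c, labels[:, 1, :].argmax(1))
        loss.backward()
        opt.step()
# if the loss does not go to ~0, debug the model/training setup before scaling up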


@ptrblck, this is a great suggestion. Thank you. I will update in a couple of days.

Hi @ptrblck, I followed your approach to create an ensemble model, but the model is predicting only one output label (0). I have 2 output labels in my data (0, 1). I tried changing the relu function to tanh as well. Could you help me understand where I am going wrong?

class Loss(torch.nn.modules.Module):
    def __init__(self, Wt1, Wt0):
        super(Loss, self).__init__()
        self.Wt1 = Wt1  # per-phase weight for the positive class
        self.Wt0 = Wt0  # per-phase weight for the negative class

    def forward(self, inputs, targets, phase):
        # weighted binary cross-entropy; assumes inputs are probabilities in (0, 1)
        # note: returns an elementwise loss; reduce it (e.g. .mean()) before calling backward()
        loss = -(self.Wt1[phase] * targets * inputs.log() +
                 self.Wt0[phase] * (1 - targets) * (1 - inputs).log())
        return loss

class MyEnsemble(nn.Module):
    def __init__(self, modelA, modelB, nb_classes=2):
        super(MyEnsemble, self).__init__()
        self.modelA = modelA
        self.modelB = modelB
        # Remove the last linear layer (note: DenseNet's head is .classifier, not .fc)
        self.modelA.fc = nn.Identity()
        self.modelB.classifier = nn.Identity()

        # Create new classifier (2048 ResNet50 features + 1664 DenseNet169 features)
        self.classifier = nn.Linear(2048 + 1664, nb_classes)

    def forward(self, x):
        x1 = self.modelA(x.clone())  # clone to make sure x is not changed by inplace methods
        x1 = x1.view(x1.size(0), -1)
        x2 = self.modelB(x)
        x2 = x2.view(x2.size(0), -1)
        x = torch.cat((x1, x2), dim=1)
        x = self.classifier(torch.tanh(x))  # F.tanh is deprecated
        return x

# Train your separate models
# We use pretrained torchvision models here
modelA = resnet50(pretrained=True)
modelB = densenet169(pretrained=True)

# Freeze these models
for param in modelA.parameters():
    param.requires_grad_(False)

for param in modelB.parameters():
    param.requires_grad_(False)

# Create ensemble model
model = MyEnsemble(modelA, modelB)
model = model.cuda()

criterion = Loss(Wt1, Wt0)
optimizer = torch.optim.SGD(list(modelA.parameters()) + list(modelB.parameters()),     lr=0.00001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=1, verbose=True)

##### Train model
model = train_model(model, criterion, optimizer, dataloaders, scheduler, dataset_sizes, num_epochs=5)

Based on your code, you are using two pretrained models (modelA and modelB), freezing their parameters, and using a new linear layer as the classifier.
This workflow is correct so far, but you are only passing the frozen parameters to the optimizer, not the new (randomly initialized) classifier:

optimizer = torch.optim.SGD(list(modelA.parameters()) + list(modelB.parameters()),     lr=0.00001)

Try to pass either all parameters or just the params of the classifier to the optimizer:

optimizer = torch.optim.SGD(model.parameters(), lr=1e-5)
# or
optimizer = torch.optim.SGD(model.classifier.parameters(), lr=1e-5)
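To double-check which parameters will actually be trained, you can list the ones requiring gradients (a quick sketch):

# only the new classifier should show up here
print([name for name, p in model.named_parameters() if p.requires_grad])
# expected output (roughly): ['classifier.weight', 'classifier.bias']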

Hi, I am getting an error like:

RuntimeError: shape '[0, -1]' is invalid for input of size 131072.

Any idea?

The error message points to an invalid view operation, where it seems you’ve specified the size of the first dimension as 0:

x = torch.randn(10, 10)
x.view(0, -1)
> RuntimeError: shape '[0, -1]' is invalid for input of size 100
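Taking the batch dimension from the input itself avoids this; a quick sketch:

x = torch.randn(10, 10)
out = x.view(x.size(0), -1)  # infer the feature dim, keep the batch dim
# or equivalently
out = torch.flatten(x, start_dim=1)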

Excuse me, what is the name of this method? I read that there are different types of ensembles and methods to build them. What is the name of this methodology, and can I ensemble three models in this way?
Thanks in advance

I think you will find more information and use cases when searching for “model ensembles” or “stacked classifiers”. sklearn gives an overview here and also provides the StackingClassifier class.
Yes, you can stack many models into an ensemble and could also use multiple levels of stacking.
Ensemble methods are (or were) often used in Kaggle competitions, and I think you might find a lot of resources on this topic there. E.g., you could browse through the winning solutions here and check which model(s) were used.
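For illustration, a minimal StackingClassifier example with toy estimators (all dataset and estimator choices here are placeholders):

from sklearn.datasets import make_classification
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, random_state=0)
stack = StackingClassifier(
    estimators=[('rf', RandomForestClassifier(random_state=0)),
                ('lr', LogisticRegression(max_iter=1000))],
    final_estimator=LogisticRegression())  # meta-model fit on the base predictions
stack.fit(X, y)
print(stack.score(X, y))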

Thanks a lot for replying. So the way to concatenate three models, for example, is a stacked classifier?
Can I use

class MyEnsemble(nn.Module):

for three models? Like this:

class MyEnsemble(nn.Module):
    def __init__(self, modelA, modelB, modelC):
        super(MyEnsemble, self).__init__()
        self.modelA = modelA
        self.modelB = modelB
        self.modelC = modelC

Yes, it would be one option and reuses the approach I’ve previously posted.
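A sketch of the extended forward pass for three backbones (the classifier's in_features must match the sum of the three feature sizes, which depend on the chosen models):

def forward(self, x):
    x1 = self.modelA(x.clone()).flatten(1)
    x2 = self.modelB(x.clone()).flatten(1)
    x3 = self.modelC(x).flatten(1)
    x = torch.cat((x1, x2, x3), dim=1)  # concatenate all three feature vectors
    return self.classifier(F.relu(x))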


Thanks a lot, I will try it now. Can you please check your inbox?

Hello @ptrblck, I have used this code snippet to ensemble two models, resnet18 and vgg16, but the performance was terrible. Can you help me with how to ensemble the two models? I actually wanted to aggregate the learned features from the convolutional blocks of both models and pass them to a custom-defined classifier.

Code:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import vgg16, resnet18

class MyEnsemble(nn.Module):
    def __init__(self, modelA, modelB, nb_classes=4):
        super(MyEnsemble, self).__init__()
        self.modelA = modelA
        self.modelB = modelB

        self.modelA.classifier = nn.Identity()
        self.modelB.fc = nn.Identity()

        # 512 ResNet18 features + 25088 VGG16 features
        self.classifier = nn.Linear(512 + 25088, nb_classes)

    def forward(self, x):
        x1 = self.modelA(x.clone())
        x1 = x1.view(x1.size(0), -1)
        x2 = self.modelB(x)
        x2 = x2.view(x2.size(0), -1)
        x = torch.cat((x1, x2), dim=1)

        x = self.classifier(F.relu(x))
        return x

modelA = vgg16(pretrained=True)
modelB = resnet18(pretrained=True)

for param in modelA.parameters():
    param.requires_grad_(False)

for param in modelB.parameters():
    param.requires_grad_(False)

fused_model = MyEnsemble(modelA, modelB)

How were both models performing in isolation on your dataset and how does it compare to the new model?

@ptrblck, I was careless with the optimizer (SGD) params: instead of taking the ensembled model's parameters, I had taken vgg16's parameters. I sorted out the issue. Thanks for the prompt reply, though.

I changed models A and B, but I got the error:
RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x2000 and 2560x8)

I don’t know what you’ve changed, but the error is usually raised by a linear layer when the expected in_features do not match the input activation features. Check which layer raises the error, check the activation shape, and make sure the feature dimensions match.
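A quick way to locate such a mismatch is to print the shape of the activation entering the suspect layer, e.g. with a forward pre-hook (a debugging sketch; model.classifier and the input resolution are assumptions):

def print_input_shape(module, inputs):
    print(module, 'received input of shape', inputs[0].shape)

handle = model.classifier.register_forward_pre_hook(print_input_shape)
model(torch.randn(1, 3, 224, 224))  # dummy input; adjust the resolution to your models
handle.remove()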
