Ensembles with pre-trained classification models for image retrieval

Hello guys,
I’m working on an image retrieval project and I want to build an ensemble of pre-trained models (for instance, ResNet-18 and ResNet-50). This is the snippet of code I have so far. Any ideas on how I can do that?

import torch
import logging
import torchvision
from torch import nn

from model.layers import Flatten, L2Norm, GeM

        "resnet18": 512,
        "resnet50": 2048,
        "resnet101": 2048,
        "vgg16": 512,

class GeoLocalizationNet(nn.Module):
    """Convolutional backbone followed by an aggregation head.

    The backbone comes from ``get_backbone`` and its output is passed
    through ``self.aggregation`` to produce the final descriptor.

    NOTE(review): the snippet this class was taken from was truncated (the
    ``nn.Sequential`` call was left open and only the ``nn.Linear`` layer
    survived). The pooling / flatten / normalisation layers below were
    reconstructed from the otherwise-unused ``model.layers`` imports
    (``L2Norm``, ``GeM``, ``Flatten``); confirm their order against the
    original project before relying on it.
    """

    def __init__(self, backbone, fc_output_dim):
        """Build the backbone and the aggregation head.

        Args:
            backbone: Backbone name forwarded to ``get_backbone``.
            fc_output_dim: Output size of the final linear layer, i.e. the
                dimensionality of the returned descriptor.
        """
        # nn.Module must be initialised before any sub-module is assigned,
        # otherwise attribute assignment raises inside torch.
        super().__init__()
        self.backbone, features_dim = get_backbone(backbone)
        self.aggregation = nn.Sequential(
            L2Norm(),  # presumably normalises along the channel dim - TODO confirm
            GeM(),  # presumably pools the spatial map to one value per channel - TODO confirm
            Flatten(),  # presumably collapses to (batch, features_dim) for nn.Linear - TODO confirm
            nn.Linear(features_dim, fc_output_dim),
            L2Norm(),
        )

    def forward(self, x):
        """Run ``x`` through the backbone and then the aggregation head."""
        x = self.backbone(x)
        x = self.aggregation(x)
        return x

def get_backbone(backbone_name):
    """Build a truncated, partially frozen pre-trained backbone.

    Args:
        backbone_name: One of "resnet18", "resnet50", "resnet101" or "vgg16".

    Returns:
        A ``(backbone, features_dim)`` tuple: the backbone as a
        ``torch.nn.Sequential`` of the retained layers, and the matching
        entry of ``CHANNELS_NUM_IN_LAST_CONV``.

    Raises:
        ValueError: If ``backbone_name`` is not one of the supported names.
    """
    if backbone_name.startswith("resnet"):
        # Validate first so an unsupported ResNet variant fails with a clear
        # message instead of an unbound-variable error further down.
        if backbone_name not in ("resnet18", "resnet50", "resnet101"):
            raise ValueError(f"Unsupported backbone: {backbone_name!r}")
        backbone = getattr(torchvision.models, backbone_name)(pretrained=True)
        for name, child in backbone.named_children():
            if name == "layer3":  # Freeze layers before conv_3
                break
            for params in child.parameters():
                params.requires_grad = False
        logging.debug("Train only layer3 and layer4 of the %s, freeze the previous ones", backbone_name)
        layers = list(backbone.children())[:-2]  # Remove avg pooling and FC layer
    elif backbone_name == "vgg16":
        backbone = torchvision.models.vgg16(pretrained=True)
        # Only the convolutional ``features`` stack is used; its last two
        # modules are dropped (in torchvision's VGG-16 these appear to be the
        # trailing ReLU and max-pool - confirm against the installed version).
        layers = list(backbone.features.children())[:-2]
        # Freeze everything except the last five retained modules.
        for layer in layers[:-5]:
            for param in layer.parameters():
                param.requires_grad = False
        logging.debug("Train last layers of the VGG-16, freeze the previous ones")
    else:
        raise ValueError(f"Unsupported backbone: {backbone_name!r}")
    backbone = torch.nn.Sequential(*layers)
    features_dim = CHANNELS_NUM_IN_LAST_CONV[backbone_name]
    return backbone, features_dim