Using multiple outputs of one network as the input to a second network before calling backward

I have been trying to train the model below:

class CombinedModels(nn.Module):
    """Run one shared feature extractor over each input channel, then regress.

    Every channel of the input is passed, as a single-channel image, through
    the same ``feature_extractor``. The per-channel outputs are concatenated
    along the feature dimension and fed to ``Regressor``.
    """

    def __init__(self):
        # nn.Module must be initialised before any sub-module is assigned;
        # without this call the assignments below raise AttributeError and
        # no parameters are registered for the optimizer.
        super().__init__()
        self.channels = 32
        self.feature_extractor = Densenet_Feature()
        self.Regressor = Fully_Connected_Regressor(self.channels)

    def forward(self, x):
        """Return the regressor output for a batch of multi-channel inputs.

        Args:
            x: 4-D tensor whose second dimension holds at least
                ``self.channels`` channels (the original note describes it as
                ``[batch size, self.channels, width, height]``).

        Returns:
            Whatever ``self.Regressor`` returns for the concatenated features.
        """
        # Slicing with i:i + 1 keeps the channel dimension, so the extractor
        # always sees a single-channel 4-D tensor.
        per_channel = [
            self.feature_extractor(x[:, i:i + 1, :, :])
            for i in range(self.channels)
        ]
        # torch.cat keeps the dtype and device of the extractor outputs, so
        # there is no hard-coded "cuda"/float16 buffer and no silent
        # down-cast of the features before they reach the regressor.
        features = torch.cat(per_channel, dim=1)
        return self.Regressor(features)

def Densenet_Feature():
    """Build a DenseNet-161 that takes 1-channel input and outputs 100 values.

    The model is loaded with its default pretrained weights. Its first
    convolution is replaced by a single-input-channel convolution whose
    kernel is the average of the pretrained kernel over its first three
    input channels, and its classifier is replaced by a fresh linear layer
    with 100 outputs.

    Returns:
        The modified DenseNet-161 module.
    """
    model = models.densenet161(weights=models.DenseNet161_Weights.DEFAULT)
    starting_weights = model.features[0].weight

    # Create the layer first, then write into its weight. The original single
    # statement bound `new_conv_layer` to a tensor and attempted item
    # assignment on the Conv2d object itself.
    new_conv_layer = nn.Conv2d(1, 96, kernel_size=7, stride=2, padding=3, bias=False)
    # In-place writes to a parameter that requires grad are only allowed
    # outside of autograd tracking.
    with torch.no_grad():
        new_conv_layer.weight[:, 0, :, :] = (
            starting_weights[:, 0, :, :]
            + starting_weights[:, 1, :, :]
            + starting_weights[:, 2, :, :]
        ) / 3

    model.features[0] = new_conv_layer

    # Compress to 100 features. The input width is read from the existing
    # head (2208 in the original code) rather than hard-coded.
    model.classifier = nn.Linear(model.classifier.in_features, 100)

    return model

def Fully_Connected_Regressor(channels):
    """Return a two-layer linear head mapping ``100 * channels`` inputs to 8.

    Args:
        channels: Multiplier for the input width; the first layer expects
            ``100 * channels`` input features.

    Returns:
        An ``nn.Sequential`` of ``Linear(100 * channels, 100)`` followed by
        ``Linear(100, 8)``.
    """
    input_width = 100 * channels
    hidden_layer = nn.Linear(input_width, 100)
    output_layer = nn.Linear(100, 8)
    # NOTE(review): there is no activation between the two layers, so the
    # stack is a composition of two affine maps — confirm this is intended.
    return nn.Sequential(hidden_layer, output_layer)

The code runs fine, but it doesn’t seem as though the gradients are being updated correctly.
I tested it by using the exact same data for both training and testing, and the model doesn’t seem to converge, whereas passing the same data in a more traditional way (as a single 32-channel image) converges nicely.
My assumption is that concatenating the outputs of the first network (the DenseNet) before passing them to the regressor somehow breaks the backward pass. Is that the case?