In-place operation error in a model with multiple heads

I can't seem to remove all the in-place operations from my model, which my algorithm cannot tolerate. I am relatively new to PyTorch, so I'm not aware of everything that happens under the hood. Any help in tracking down and removing the remaining in-place operations would be appreciated.
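
For reference, the traceback at the bottom was captured with autograd anomaly detection switched on:

import torch

# Makes autograd record the forward-pass traceback of the operation whose
# gradient computation later fails (slow; intended for debugging only)
torch.autograd.set_detect_anomaly(True)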

I am building the model like this:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter


class OdlVGG16(nn.Module):

    def __init__(self, num_classes, input_shape, batch_size, learning_rate):
        super(OdlVGG16, self).__init__()
        self.layers = []
        self.classifiers = []
        self.optimizers = []
        self.num_classes = num_classes
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        input_features = input_shape[0]
        feature_maps = [64, 64]  # , 128, 128, 256, 256, 256, 512, 512, 512, 512, 512, 512
        for i, feature_map in enumerate(feature_maps):
            # Make each convolutional layer; the spatial dimensions halve at
            # the VGG16 pooling positions (layers 1, 3, 6, 9 and 12)
            if i == 1 or i == 3 or i == 6 or i == 9 or i == 12:
                input_shape[1] //= 2
                input_shape[2] //= 2
            self.layers.append(nn.Conv2d(in_channels=input_features,
                                         out_channels=feature_map,
                                         kernel_size=(3, 3),
                                         stride=(1, 1),
                                         padding=(1, 1)))
            # Make a trainable classifier head for each convolutional layer
            self.classifiers.append(nn.Sequential(
                nn.Linear(feature_map * input_shape[1] * input_shape[2], 100),
                nn.ReLU(inplace=False),
                nn.Dropout(inplace=False),
                nn.Linear(100, self.num_classes),
            ))
            input_features = feature_map
            self.optimizers.append(torch.optim.Adam(self.classifiers[i].parameters(),
                                                    lr=self.learning_rate))
        self.max_layers = len(self.layers)
        # One hedging weight per head, initialised uniformly
        self.alpha = Parameter(torch.full((self.max_layers, 1), 1 / (self.max_layers + 1)))

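One structural note before the methods: self.layers and self.classifiers are plain Python lists, so the submodules are not registered on the parent nn.Module (parameters(), .to(device) and state_dict() will not see them). This may be unrelated to the error below, but a minimal sketch of the difference, using a hypothetical one-layer module:

import torch.nn as nn

class WithModuleList(nn.Module):
    def __init__(self):
        super().__init__()
        # nn.ModuleList registers its contents as submodules
        self.layers = nn.ModuleList([nn.Conv2d(3, 64, 3, padding=1)])

class WithPlainList(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = [nn.Conv2d(3, 64, 3, padding=1)]  # not registered

print(len(list(WithModuleList().parameters())))  # 2 (weight and bias)
print(len(list(WithPlainList().parameters())))   # 0
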
The forward and backward passes are computed as shown below. update_weights is called once per iteration from the main section of the code; a simplified sketch of that loop follows the methods.

    def forward(self, x):
        layer_connections = []
        classifier_connections = []
        layer_connections.append(F.relu(self.layers[0](x), inplace=False))
        for i in range(1, self.max_layers):
            # Pool at the same positions where __init__ halved the spatial dims
            if i == 1 or i == 3 or i == 6 or i == 9 or i == 12:
                layer_connections.append(F.max_pool2d(F.relu(self.layers[i](layer_connections[i - 1]), inplace=False), 2))
            else:
                layer_connections.append(F.relu(self.layers[i](layer_connections[i - 1]), inplace=False))
        # One prediction per head, stacked along a new leading dimension
        for i in range(self.max_layers):
            classifier_connections.append(self.classifiers[i](layer_connections[i].view(layer_connections[i].size(0), -1)))
        classifier_all_layers = torch.stack(classifier_connections)
        return classifier_all_layers

    def zero_grad(self):
        # Clear both the per-head optimizer grads and the conv-layer grads
        for i in range(self.max_layers):
            self.optimizers[i].zero_grad()
            self.layers[i].weight.grad.data = torch.zeros_like(self.layers[i].weight.grad.data)
            self.layers[i].bias.grad.data = torch.zeros_like(self.layers[i].bias.grad.data)
    
    def update_weights(self, inputs, labels, criterion):
        outputs_per_layer = self(inputs)
        # One loss per classifier head
        loss_per_layer = []
        for output in outputs_per_layer:
            loss = criterion(output.view(self.batch_size, self.num_classes),
                             labels.view(self.batch_size).long())
            loss_per_layer.append(loss)
        # Accumulate alpha-weighted conv gradients across heads
        w = [None] * len(loss_per_layer)
        b = [None] * len(loss_per_layer)
        with torch.no_grad():
            for i in range(len(loss_per_layer)):
                loss_per_layer[i].backward(retain_graph=True)
                self.optimizers[i].step()
                # self.classifiers[i].weight.data -= self.learning_rate * self.alpha[i] * self.classifiers[i].weight.grad.data
                # self.classifiers[i].bias.data -= self.learning_rate * self.alpha[i] * self.classifiers[i].bias.grad.data
                for j in range(i + 1):
                    if w[j] is None:
                        w[j] = self.alpha[i] * self.layers[j].weight.grad.data
                        b[j] = self.alpha[i] * self.layers[j].bias.grad.data
                    else:
                        w[j] += self.alpha[i] * self.layers[j].weight.grad.data
                        b[j] += self.alpha[i] * self.layers[j].bias.grad.data
                self.zero_grad()
            # Apply the accumulated updates to the shared conv layers
            for i in range(len(loss_per_layer)):
                self.layers[i].weight.data -= self.learning_rate * w[i]
                self.layers[i].bias.data -= self.learning_rate * b[i]
            # Hedge update of the per-head weights; self.b (discount) and
            # self.s (smoothing) are hyperparameters set elsewhere (not shown)
            for i in range(len(loss_per_layer)):
                self.alpha[i] *= torch.pow(self.b, loss_per_layer[i])
                self.alpha[i] = torch.max(self.alpha[i], self.s / self.max_layers)
        self.alpha = Parameter(self.alpha / torch.sum(self.alpha), requires_grad=False)
        # Alpha-weighted combination of the per-head outputs
        real_output = torch.sum(torch.mul(
                self.alpha.view(self.max_layers, 1).repeat(1, self.batch_size).view(
                    self.max_layers, self.batch_size, 1), outputs_per_layer), 0)
        loss = criterion(real_output.view(self.batch_size, self.num_classes),
                         labels.view(self.batch_size).long())
        return loss
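
For context, this is roughly how update_weights is driven (a simplified stand-in for the real loop in train.py; the shapes are hypothetical CIFAR-10-like ones, and it assumes the hedging hyperparameters self.b and self.s used above have been set):

import torch
import torch.nn as nn

model = OdlVGG16(num_classes=10, input_shape=[3, 32, 32],
                 batch_size=4, learning_rate=1e-3)
criterion = nn.CrossEntropyLoss()

for step in range(100):  # stand-in for the real iteration count
    inputs = torch.randn(4, 3, 32, 32)   # stand-in batch; real data comes from a loader
    labels = torch.randint(0, 10, (4,))
    loss = model.update_weights(inputs, labels, criterion)  # raises the error below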

The stack trace (with anomaly detection enabled) is as follows:

C:\Users\fvanbeer\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\autograd\__init__.py:130: UserWarning: Error detected in AddmmBackward. Traceback of forward call that caused the error:
  File "C:\Users\fvanbeer\Documents\GitHub\msc\code\main.py", line 157, in <module>
    main(arguments)
  File "C:\Users\fvanbeer\Documents\GitHub\msc\code\main.py", line 86, in main
    train(args, model, train_val_list)
  File "C:\Users\fvanbeer\Documents\GitHub\msc\code\train.py", line 36, in train
    loss = model.update_weights(inputs, labels, criterion)
  File "C:\Users\fvanbeer\Documents\GitHub\msc\code\models.py", line 122, in update_weights
    outputs_per_layer = self(inputs)
  File "C:\Users\fvanbeer\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\nn\modules\module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "C:\Users\fvanbeer\Documents\GitHub\msc\code\models.py", line 110, in forward
    classifier_connections.append(self.classifiers[i](layer_connections[i].view(layer_connections[i].size(0),-1)))
  File "C:\Users\fvanbeer\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\nn\modules\module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "C:\Users\fvanbeer\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\nn\modules\container.py", line 117, in forward
    input = module(input)
  File "C:\Users\fvanbeer\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\nn\modules\module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "C:\Users\fvanbeer\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\nn\modules\linear.py", line 93, in forward
    return F.linear(input, self.weight, self.bias)
  File "C:\Users\fvanbeer\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\nn\functional.py", line 1690, in linear
    ret = torch.addmm(bias, input, weight.t())
 (Triggered internally at  ..\torch\csrc\autograd\python_anomaly_mode.cpp:104.)
  Variable._execution_engine.run_backward(
Traceback (most recent call last):
  File "C:\Users\fvanbeer\Documents\GitHub\msc\code\main.py", line 157, in <module>
    main(arguments)
  File "C:\Users\fvanbeer\Documents\GitHub\msc\code\main.py", line 86, in main
    train(args, model, train_val_list)
  File "C:\Users\fvanbeer\Documents\GitHub\msc\code\train.py", line 36, in train
    loss = model.update_weights(inputs, labels, criterion)
  File "C:\Users\fvanbeer\Documents\GitHub\msc\code\models.py", line 131, in update_weights
    loss_per_layer[i].backward(retain_graph=True)
  File "C:\Users\fvanbeer\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\tensor.py", line 221, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "C:\Users\fvanbeer\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\autograd\__init__.py", line 130, in backward
    Variable._execution_engine.run_backward(
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [100, 10]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
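
What I have narrowed it down to so far: with num_classes = 10, the [100, 10] tensor in the error matches the transpose of one head's Linear(100, num_classes) weight, which optimizers[i].step() modifies in place while retain_graph=True keeps the saved forward state alive. A toy sketch that seems to reproduce the same failure mode (hypothetical standalone layer, not my actual model):

import torch
import torch.nn as nn

lin = nn.Linear(100, 10)
opt = torch.optim.SGD(lin.parameters(), lr=0.1)
x = torch.randn(4, 100, requires_grad=True)  # requires_grad so the weight is saved for backward
out = lin(x)
loss_a = out.sum()
loss_b = (out * 2).sum()

loss_a.backward(retain_graph=True)
opt.step()         # in-place update bumps lin.weight's version counter
loss_b.backward()  # RuntimeError: ... [torch.FloatTensor [100, 10]] ... output 0 of TBackward ...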