Model does not train, weights are not updated

Hello,
I tried to train a combined ResNet + LSTM model on several GPUs, but the loss and the weights do not change during training. I am confused and have no idea why. Maybe someone has an idea.

import torch
import torch.nn as nn

class ResNet(nn.Module):
    def __init__(self):
        super(ResNet, self).__init__()
        resnet = load_pretrainednet()  # user-defined helper returning a pretrained ResNet
        modules = list(resnet.children())[:-1]  # drop the final fully connected layer
        self.resnet = nn.Sequential(*modules)

    def forward(self, x):      
        x1 = self.resnet(x)  
        x1 = x1.view(x1.size(0), -1)  
        
        print("Outside: input size", x.size(), "outputs_size", x1.size())

        return x1
    
class Combine(nn.Module):
    def __init__(self):
        super(Combine, self).__init__()
        self.cnn = ResNet()
        self.rnn = nn.LSTM(input_size=2048, hidden_size=21, num_layers=1, batch_first=True)
        self.linear = nn.Linear(21,21)

    def forward(self, x):
        batch_size, C, H, W = x.size()
        c_in = x.view(batch_size, C, H, W)  # no-op reshape; input is already (N, C, H, W)
        c_out = self.cnn(c_in)
        r_in = c_out.view(batch_size, 1, -1)  # add a time dimension: sequence length 1 for the LSTM
        self.rnn.flatten_parameters()
        r_out, (h_n, h_c) = self.rnn(r_in)
        r_out2 = self.linear(r_out[:, -1, :])
        return r_out2
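
For context: the training code below accesses net.module, which implies the model is wrapped in nn.DataParallel. A minimal sketch of that setup; the wrapping itself is my assumption, inferred from those accesses and the two GPUs in the printout:

# Assumed multi-GPU setup (inferred from the `net.module` accesses below)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = Combine()
if torch.cuda.device_count() > 1:
    net = nn.DataParallel(net)  # replicates the model and splits each batch across GPUs
net = net.to(device)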



import gc

import numpy as np
import nvsmi
import torch.optim as optim
from termcolor import colored

# `writer` (a tensorboard SummaryWriter) and `device` are assumed to be
# defined at module level, as in the rest of the script

def train_net(net, data_loader, num_images):
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.000001)

    running_loss = 0.0
    running_corrects = 0.0
    net.train()  # Set model to training mode
    run_count = 0
    current_images = 0
    # keep the current weights for a change check after each optimizer step
    name, old_lstm_weight = list(net.module.rnn.named_parameters())[0]
    old_linear_weight = net.module.linear.weight


    for i, (inputs, labels, masks) in enumerate(data_loader, 1):

        print(run_count)
        torch.cuda.empty_cache()
        gc.collect()
        # the loader yields a list of tensors per batch; concatenate into one tensor
        inputs = torch.cat(inputs, 0)
        labels = torch.cat(labels, 0)
        
        inputs = inputs.to(device)
        labels = labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward
        # track history if only in train
        print('Current input length in total: {}'.format(inputs.size()[0]))
        if torch.cuda.is_available():
            print('Current input size per GPU: {}'.format(np.ceil(inputs.size()[0] / torch.cuda.device_count())))
                          
        preds = net(inputs)
        
        for elem in nvsmi.get_gpu_processes():
            print(elem)

        # threshold the soft labels to hard 0/1 targets
        labels[labels >= 0.5] = 1.0
        labels[labels < 0.5] = 0.0
        loss = criterion(preds, labels)
        
        loss.backward()
        
        # print a slice of the first parameter before the update...
        for elm in net.module.parameters():
            print(elm[0][0])
            break

        optimizer.step()

        # ...and the same slice after the update, to see whether it moved
        for elm in net.module.parameters():
            print(elm[0][0])
            break
        
        preds = torch.sigmoid(preds)  # torch.Size([N, C]), e.g. tensor([[0., 0.5, 0.]])
        preds[preds >= 0.5] = 1.0
        preds[preds < 0.5] = 0.0

        # fraction of correctly predicted label entries in this batch
        accuracy = (preds == labels).sum() / (labels.size(0) * labels.size(1))
        
        # zero the parameter gradients
        optimizer.zero_grad()
        
        # statistics
        running_loss += loss.item() * inputs.size(0)
        running_corrects += accuracy
        
        del preds
        torch.cuda.empty_cache()
        gc.collect() 

        num_images += inputs.size(0)
        current_images += inputs.size(0)

        if run_count % 10 == 9:    # every 10 mini-batches...
            # ...log the running loss and accuracy
            writer.add_scalar('Training/NEW_Runs_Loss', running_loss / current_images, num_images)
            writer.add_scalar('Training/NEW_Runs_Accuracy', running_corrects.double() / current_images, num_images)

        run_count += 1
        
        name, lstm_weight = list(net.module.rnn.named_parameters())[0]
        linear_weight = net.module.linear.weight
        
        if torch.equal(lstm_weight, old_lstm_weight):
            print(colored("LSTM weight didn't change", 'red'))
        else:
            print(colored("LSTM weight changed", 'green'))

        if torch.equal(linear_weight, old_linear_weight):
            print(colored("Linear weight didn't change", 'red'))
        else:
            print(colored("Linear weight changed", 'green'))

        old_lstm_weight = lstm_weight     
        old_linear_weight = linear_weight

    return net, running_loss, running_corrects, optimizer, num_images
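
One more debugging aid that would have helped here: after loss.backward(), check that gradients actually reach every submodule. The helper below is my own sketch, not part of the original script:

def report_grad_norms(model):
    # a parameter whose grad is None (or all zeros) can never be updated
    for name, p in model.named_parameters():
        grad = 'None' if p.grad is None else '{:.3e}'.format(p.grad.norm().item())
        print('{}: grad norm = {}'.format(name, grad))

# usage, right after loss.backward():
# report_grad_norms(net.module)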

Print output:

Current input length in total: 90
Current input size per GPU: 45.0
Outside: input size torch.Size([45, 3, 224, 224]) outputs_size torch.Size([45, 2048])
Outside: input size torch.Size([45, 3, 224, 224]) outputs_size torch.Size([45, 2048])
pid: 3375727 | gpu_id: 0 | gpu_uuid: GPU-96de9d91-de41-4be2-6c12-280909e98722 | gpu_name: Tesla V100-SXM2-32GB | used_memory:  9689.0MB
pid: 3375727 | gpu_id: 1 | gpu_uuid: GPU-bbcd5e54-bdfe-7c4c-3d6b-8312ba354811 | gpu_name: Tesla V100-SXM2-32GB | used_memory:  8939.0MB
tensor([[-0.0124, -0.0049, -0.0047, -0.0125,  0.0765, -0.0013, -0.0930],
        [-0.0035, -0.0379, -0.0086,  0.1207,  0.1172,  0.2363,  0.0651],
        [ 0.0040,  0.0597,  0.0610,  0.0591,  0.0746,  0.1351,  0.1906],
        [ 0.1521, -0.0442, -0.1501, -0.2492, -0.2439, -0.1416,  0.1227],
        [ 0.0078,  0.0360, -0.0127, -0.2912, -0.3637, -0.2218,  0.0186],
        [ 0.0095,  0.0808,  0.2047,  0.1493,  0.0226, -0.0785, -0.0541],
        [-0.0052,  0.0481,  0.1400,  0.3045,  0.2305,  0.0612,  0.1152]],
       device='cuda:0', dtype=torch.float64, grad_fn=<SelectBackward>)
tensor([[-0.0124, -0.0049, -0.0047, -0.0125,  0.0765, -0.0013, -0.0930],
        [-0.0035, -0.0379, -0.0086,  0.1207,  0.1172,  0.2363,  0.0651],
        [ 0.0040,  0.0597,  0.0610,  0.0591,  0.0746,  0.1351,  0.1906],
        [ 0.1521, -0.0442, -0.1501, -0.2492, -0.2439, -0.1416,  0.1227],
        [ 0.0078,  0.0360, -0.0127, -0.2912, -0.3637, -0.2218,  0.0186],
        [ 0.0095,  0.0808,  0.2047,  0.1493,  0.0226, -0.0785, -0.0541],
        [-0.0052,  0.0481,  0.1400,  0.3045,  0.2305,  0.0612,  0.1152]],
       device='cuda:0', dtype=torch.float64, grad_fn=<SelectBackward>)
LSTM weight didn't change
Linear weight didn't change

Thanks

Update: removing the linear layer and making a deep copy of the weights before comparing solved the problem.
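
To spell out why the deep copy matters: net.module.linear.weight returns a reference to the live parameter tensor, not a snapshot. After optimizer.step() updates the parameter in place, old_linear_weight and linear_weight point at the same storage, so torch.equal always reports them as equal even when training works fine. A self-contained sketch of the pitfall using a stand-in nn.Linear (the names here are mine, not from the original script):

import copy
import torch
import torch.nn as nn
import torch.optim as optim

# tiny stand-in model just to demonstrate the reference-vs-copy pitfall
layer = nn.Linear(4, 4)
optimizer = optim.SGD(layer.parameters(), lr=0.1)

by_reference = layer.weight             # same storage as the live parameter
by_value = copy.deepcopy(layer.weight)  # snapshot (layer.weight.detach().clone() also works)

loss = layer(torch.randn(2, 4)).sum()
loss.backward()
optimizer.step()

print(torch.equal(layer.weight, by_reference))  # True: the reference always "matches" itself
print(torch.equal(layer.weight, by_value))      # False: the step really changed the weights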