PyTorch loss won't go down on a regression problem

I created the same data loader and the same data iterator for Keras, PyTorch, and MXNet with a VGG network; every network is initialized randomly, with no pretrained model. But PyTorch's loss stops going down at about 200, while Keras's and MXNet's losses go down to about 20.
The PyTorch code is below. I can't believe everything in it is right, so could you help me find the problem?

import copy
import time

import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from cnn_finetune import make_model  # pytorch-cnn-finetune package

class myModel(nn.Module):
    def __init__(self):
        super(myModel, self).__init__()
        # Keep only the VGG16 convolutional backbone, randomly initialized.
        # INPUT_SIZE is defined elsewhere in my script (375 per the summary below).
        self.features = list(make_model('vgg16', num_classes=1, pretrained=False,
                                        input_size=INPUT_SIZE).children())[0]
        self.globalavg = nn.AdaptiveAvgPool2d(1)
        # Regression head: one scalar output per sample.
        self.head = nn.Sequential(nn.Linear(512, 1024, bias=True),
                                  nn.ReLU(),
                                  nn.Linear(1024, 1, bias=True))

    def forward(self, x):
        x = self.features(x)       # [N, 512, H', W']
        x = self.globalavg(x)      # [N, 512, 1, 1]
        x = x.view(x.size(0), -1)  # [N, 512]
        x = self.head(x)           # [N, 1]
        return x

model = myModel()
model = model.cuda()
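# Quick sanity check (assuming 375x375 inputs, as the summary below suggests):
# the model should emit one value per sample, i.e. shape [N, 1].
dummy = torch.randn(2, 3, 375, 375).cuda()
print(model(dummy).shape)  # expected: torch.Size([2, 1])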
# training loop adapted from the pytorch.org fine-tuning tutorial
def train_model(model, dataloaders, criterion, optimizer, num_epochs=21, is_inception=False):
    since = time.time()
    val_loss_history = []
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = float('inf')

    for epoch in tqdm(range(num_epochs)):
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode
            running_loss = 0.0
            # Iterate over data.
            bar = tqdm(dataloaders[phase])
            for inputs, labels in bar:
                inputs = inputs.to(device)
                labels = labels.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    # Special case for Inception: in training it also returns an
                    #   auxiliary output, and the loss is the final loss plus a
                    #   weighted auxiliary loss; in eval only the final output counts.
                    if is_inception and phase == 'train':
                        # From https://discuss.pytorch.org/t/how-to-optimize-inception-model-with-auxiliary-classifiers/7958
                        outputs, aux_outputs = model(inputs)
                        loss1 = criterion(outputs, labels)
                        loss2 = criterion(aux_outputs, labels)
                        loss = loss1 + 0.4 * loss2
                    else:
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                bar.set_description('loss: {:.4f}'.format(loss.item()))
                # statistics
                running_loss += loss.item() * inputs.size(0)
            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            print('{} Loss: {:.4f}'.format(phase, epoch_loss))
            # keep the weights that achieve the lowest validation loss
            if phase == 'val':
                val_loss_history.append(epoch_loss)
                if epoch_loss < best_loss:
                    best_loss = epoch_loss
                    best_model_wts = copy.deepcopy(model.state_dict())
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_loss_history
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
dataloaders = {'train': train_dl, 'val': valid_dl}
train_model(model, dataloaders, nn.MSELoss(reduction='mean'),
            optim.Adam(model.parameters(), lr=0.001), num_epochs=21)
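A quick way to double-check the loader (just a sketch using the train_dl defined above):

xb, yb = next(iter(train_dl))
print(xb.shape, yb.shape)  # e.g. inputs [N, 3, 375, 375]; labels should be [N, 1] to match the output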

The network is as follows:

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1         [-1, 64, 375, 375]           1,792
              ReLU-2         [-1, 64, 375, 375]               0
            Conv2d-3         [-1, 64, 375, 375]          36,928
              ReLU-4         [-1, 64, 375, 375]               0
         MaxPool2d-5         [-1, 64, 187, 187]               0
            Conv2d-6        [-1, 128, 187, 187]          73,856
              ReLU-7        [-1, 128, 187, 187]               0
            Conv2d-8        [-1, 128, 187, 187]         147,584
              ReLU-9        [-1, 128, 187, 187]               0
        MaxPool2d-10          [-1, 128, 93, 93]               0
           Conv2d-11          [-1, 256, 93, 93]         295,168
             ReLU-12          [-1, 256, 93, 93]               0
           Conv2d-13          [-1, 256, 93, 93]         590,080
             ReLU-14          [-1, 256, 93, 93]               0
           Conv2d-15          [-1, 256, 93, 93]         590,080
             ReLU-16          [-1, 256, 93, 93]               0
        MaxPool2d-17          [-1, 256, 46, 46]               0
           Conv2d-18          [-1, 512, 46, 46]       1,180,160
             ReLU-19          [-1, 512, 46, 46]               0
           Conv2d-20          [-1, 512, 46, 46]       2,359,808
             ReLU-21          [-1, 512, 46, 46]               0
           Conv2d-22          [-1, 512, 46, 46]       2,359,808
             ReLU-23          [-1, 512, 46, 46]               0
        MaxPool2d-24          [-1, 512, 23, 23]               0
           Conv2d-25          [-1, 512, 23, 23]       2,359,808
             ReLU-26          [-1, 512, 23, 23]               0
           Conv2d-27          [-1, 512, 23, 23]       2,359,808
             ReLU-28          [-1, 512, 23, 23]               0
           Conv2d-29          [-1, 512, 23, 23]       2,359,808
             ReLU-30          [-1, 512, 23, 23]               0
        MaxPool2d-31          [-1, 512, 11, 11]               0
AdaptiveAvgPool2d-32            [-1, 512, 1, 1]               0
           Linear-33                 [-1, 1024]         525,312
             ReLU-34                 [-1, 1024]               0
           Linear-35                    [-1, 1]           1,025
================================================================
Total params: 15,241,025
Trainable params: 15,241,025
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 1.61
Forward/backward pass size (MB): 606.82
Params size (MB): 58.14
Estimated Total Size (MB): 666.57
----------------------------------------------------------------

Could you check the shape of your labels and see if it's the same as the model output's shape? If the labels come out of the loader as [batch] while the model outputs [batch, 1], nn.MSELoss will broadcast the two tensors and compute the loss over the wrong pairs.
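A minimal sketch of the pitfall (the tensor sizes here are made up):

import torch
import torch.nn as nn

criterion = nn.MSELoss(reduction='mean')
outputs = torch.randn(8, 1)   # model output: [batch, 1]
targets = torch.randn(8)      # labels as they often come from a loader: [batch]

# [8, 1] and [8] broadcast to [8, 8]: every output is compared with every
# target, so the mean squared error is computed over the wrong pairs
# (recent PyTorch versions emit a UserWarning here).
print(criterion(outputs, targets).item())

# Matching the shapes restores the intended element-wise loss.
print(criterion(outputs, targets.view(-1, 1)).item())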
