Problem with NaN output and loss when training model

Hello everybody, I’m training a VGG16 model on input data loaded from CIFAR10/100, and the outputs become NaN when the data is passed through the model. This is my training loop:

for inputs, targets in trainloader:
    # Move the batch to the GPU if one is available
    if torch.cuda.is_available() and gpu:
        inputs, targets = inputs.cuda(), targets.cuda()

    optimizer.zero_grad()

    output = model(inputs)
    loss_ce = F.cross_entropy(output, targets)

And this is my result:

tensor(nan, grad_fn=<NllLossBackward0>) <- cross-entropy
tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], grad_fn=<MmBackward0>) <- output

I have checked the input data, and it looks normal. I think the problem is in the model, but I don’t know how to solve it.
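For reference, this is roughly what I mean by checking the input data (a minimal sketch using the trainloader from above; num_classes would be 10 or 100 depending on the dataset):

import torch

num_classes = 100  # assumed: 10 for CIFAR10, 100 for CIFAR100

for inputs, targets in trainloader:
    # every input value should be finite (no NaN/Inf)
    assert torch.isfinite(inputs).all(), "found NaN/Inf in inputs"
    # every target should be a valid class index for cross_entropy
    assert (targets >= 0).all() and (targets < num_classes).all(), "target index out of range"

This is the relevant part of my model: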

    def forward(self, x):
        out = self.features(x)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out

    def _make_layers(self, cfg):
        layers = []
        in_channels = 3
        # cfg entries: 'A' = average pooling, 'D' = dropout, 'M' = max pooling,
        # and a number = output channels of a 3x3 convolution
        for x in cfg:
            stride = 1
            if x == 'A':
                layers += [nn.AvgPool2d(kernel_size=2, stride=2)]
            elif x == 'D':
                layers += [nn.Dropout(0.2)]
            elif x == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1, stride=stride, bias=False),
                           nn.ReLU(inplace=True)]
                in_channels = x
        
        return nn.Sequential(*layers)

    
    def _make_fc_layers(self):
        layers = []
#        if self.vgg_name=='VGG16' & self.labels==1000:
        if self.vgg_name == 'VGG9':
            layers += [nn.Linear(512*9, 4096, bias=False)]
        elif self.vgg_name == 'VGG11':
            layers += [nn.Linear(512*9, 4096, bias=False)]
        elif self.vgg_name == 'VGG13':
            layers += [nn.Linear(512, 4096, bias=False)]
        else:
            layers += [nn.Linear(4608, 4096, bias=False)]
        layers += [nn.ReLU(inplace=True)]
        layers += [nn.Dropout(0.5)]
        layers += [nn.Linear(4096, 4096, bias=False)]
        layers += [nn.ReLU(inplace=True)]
        layers += [nn.Dropout(0.5)]
        layers += [nn.Linear(4096, self.labels, bias=False)]
        
        return nn.Sequential(*layers)
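
To see whether the problem really starts inside the model, one thing I could try (just a sketch; check_finite is only an illustrative name) is registering forward hooks that report the first layer whose output is not finite:

import torch

def check_finite(name):
    def hook(module, inputs, output):
        # report the first module whose output contains NaN or Inf
        if isinstance(output, torch.Tensor) and not torch.isfinite(output).all():
            print(f"non-finite output in layer: {name}")
    return hook

for name, module in model.named_modules():
    module.register_forward_hook(check_finite(name))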
Here are the output of the model and the cross-entropy loss, printed over several consecutive training iterations:

tensor([[-0.0445,  0.1427, -0.0183,  ...,  0.0952,  0.0687,  0.0972],
        [-0.0007, -0.0532, -0.0123,  ...,  0.0760,  0.0193, -0.0148],
        [-0.0266,  0.0014, -0.0522,  ..., -0.0256,  0.0049,  0.0411],
        ...,
        [ 0.2513,  0.1034,  0.0779,  ...,  0.0360,  0.0267, -0.0220],
        [ 0.0625, -0.0081,  0.0907,  ..., -0.0979,  0.0311,  0.0794],
        [-0.0597,  0.0334,  0.0099,  ...,  0.0193, -0.0386,  0.0059]],
       grad_fn=<MmBackward0>)
tensor(2.3044, grad_fn=<NllLossBackward0>)
tensor([[ 4.6967e-01,  1.9761e-01, -4.4746e-01,  ..., -1.0909e-01,
          3.7677e-01, -5.5646e-02],
        [ 3.9633e-01,  2.9369e-01, -4.0507e-01,  ...,  1.5722e-01,
          2.6546e-01,  7.4541e-02],
        [ 5.7334e-01,  2.4784e-02, -5.0350e-01,  ...,  5.9695e-02,
          5.2707e-01, -7.8346e-02],
        ...,
        [ 5.2913e-01,  1.4142e-01, -7.9371e-01,  ...,  2.8638e-02,
          4.1665e-01,  1.9140e-01],
        [ 7.6531e-01,  2.0086e-01, -7.6414e-01,  ..., -4.9669e-02,
          6.9262e-01,  2.7740e-02],
        [ 6.0875e-01,  1.4864e-01, -5.5716e-01,  ..., -1.2479e-04,
          1.7511e-01, -4.6789e-02]], grad_fn=<MmBackward0>)
tensor(2.3279, grad_fn=<NllLossBackward0>)
tensor([[ 0.6177,  0.9962, -0.0098,  ..., -0.1540,  0.4561, -0.3313],
        [ 1.9157,  2.3699, -0.3133,  ...,  0.1561,  1.5718, -0.2482],
        [ 1.0155,  1.3725,  0.1844,  ..., -0.1980,  0.7665, -0.4477],
        ...,
        [ 0.9811,  2.9002, -0.1897,  ..., -0.1517,  1.6514, -0.8167],
        [ 1.1547,  1.7881, -0.4431,  ..., -0.4696,  0.8573, -1.0190],
        [ 1.4662,  3.3045,  0.2333,  ..., -0.2152,  0.6779, -0.4037]],
       grad_fn=<MmBackward0>)
tensor(2.6869, grad_fn=<NllLossBackward0>)
tensor([[ 0.6621,  0.1375,  0.0398,  ..., -0.4012, -0.0729, -0.2406],
        [ 1.5922,  0.2860,  0.2198,  ..., -1.1856, -0.3055, -0.3399],
        [ 0.3546,  0.0543,  0.0297,  ..., -0.2679, -0.0325, -0.1089],
        ...,
        [ 0.5963,  0.1590,  0.0642,  ..., -0.4578, -0.0248, -0.2189],
        [ 1.2239,  0.1536,  0.2906,  ..., -1.0098, -0.0974, -0.4257],
        [ 0.3951,  0.0091,  0.0517,  ..., -0.3781,  0.0067, -0.1069]],
       grad_fn=<MmBackward0>)
tensor(2.3455, grad_fn=<NllLossBackward0>)
tensor([[ 2.0563,  0.3959,  0.9565,  ..., -1.7976,  0.4246, -0.2335],
        [ 0.6608,  0.0258,  0.2657,  ..., -0.5602,  0.1665, -0.2382],
        [ 0.8100,  0.0242,  0.2837,  ..., -0.7123,  0.2756, -0.1894],
        ...,
        [ 0.2330,  0.0331,  0.1242,  ..., -0.2710,  0.0304, -0.0494],
        [ 2.1338,  0.2029,  0.6819,  ..., -1.8708,  0.4788, -0.7440],
        [ 0.5896,  0.0441,  0.2288,  ..., -0.5129,  0.0958, -0.2150]],
       grad_fn=<MmBackward0>)
tensor(2.3971, grad_fn=<NllLossBackward0>)
tensor([[ 1.2402e+01, -1.0644e-01,  3.8001e+00,  ..., -1.0851e+01,
          3.0740e+00, -1.5251e+00],
        [ 4.3619e-01,  4.6270e-02,  1.3836e-01,  ..., -3.9096e-01,
          1.6142e-01, -2.6632e-02],
        [ 3.2662e+01,  2.8224e+00,  1.0777e+01,  ..., -2.9808e+01,
          1.2905e+01, -7.9600e-01],
        ...,
        [ 6.0897e-01,  4.7144e-03,  2.0982e-01,  ..., -5.7752e-01,
          2.5597e-01, -2.7669e-02],
        [ 9.9927e+01,  7.2934e-01,  3.1927e+01,  ..., -8.4562e+01,
          3.5506e+01, -1.7959e+00],
        [ 4.8350e-01,  4.7005e-02,  1.6941e-01,  ..., -5.2937e-01,
          2.1783e-01, -5.2236e-02]], grad_fn=<MmBackward0>)
tensor(17.6842, grad_fn=<NllLossBackward0>)
tensor([[-9.2668e+03,  1.6498e+03,  1.3751e+03,  ...,  1.1747e+02,
          2.1815e+03,  1.8640e+03],
        [-8.6874e+03,  1.5934e+03,  1.2635e+03,  ...,  1.3284e+02,
          2.0474e+03,  1.9207e+03],
        [-3.4754e+03,  6.5272e+02,  5.2269e+02,  ...,  2.8452e+01,
          7.8794e+02,  7.7275e+02],
        ...,
        [-5.4736e+03,  9.4801e+02,  8.8032e+02,  ...,  3.4059e+01,
          1.3176e+03,  1.2020e+03],
        [-5.0371e+03,  8.1725e+02,  8.0006e+02,  ...,  6.0789e+01,
          1.2386e+03,  1.1186e+03],
        [-3.9461e+03,  6.7896e+02,  6.0550e+02,  ..., -6.5373e+00,
          9.8526e+02,  8.1984e+02]], grad_fn=<MmBackward0>)
tensor(1749.0946, grad_fn=<NllLossBackward0>)
tensor([[ 3.6088e+16,  3.3664e+16,  1.7687e+16,  ...,  3.1587e+16,
         -2.9089e+17,  2.8984e+16],
        [ 4.8691e+16,  4.6644e+16,  2.4133e+16,  ...,  4.4263e+16,
         -3.9887e+17,  3.9937e+16],
        [ 5.1361e+16,  4.9357e+16,  2.4861e+16,  ...,  4.5952e+16,
         -4.2707e+17,  4.5693e+16],
        ...,
        [ 1.2239e+17,  1.1587e+17,  6.4194e+16,  ...,  1.1051e+17,
         -1.0198e+18,  9.7618e+16],
        [ 8.4602e+16,  7.6394e+16,  4.1487e+16,  ...,  7.5197e+16,
         -6.8583e+17,  6.9281e+16],
        [ 1.1045e+17,  1.0620e+17,  5.7726e+16,  ...,  1.0014e+17,
         -9.1708e+17,  8.9059e+16]], grad_fn=<MmBackward0>)
tensor(6.3041e+16, grad_fn=<NllLossBackward0>)

Actually, the output of the model and the loss blow up after a few iterations, but I don’t know what happened.
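
One way to trace where the first bad value is produced (a sketch, not something I have run yet) would be to enable PyTorch’s anomaly detection, which raises an error at the first backward operation that creates a NaN gradient:

import torch

# Raise an error at the first backward op that produces a NaN gradient,
# instead of letting it silently propagate through the whole model;
# then run the training loop from above as usual.
torch.autograd.set_detect_anomaly(True)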

Maybe your learning rate is too large, as the current training diverges and the loss explodes, which eventually creates the NaN outputs and loss.
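
As a sketch of what that could look like (the learning rate 0.01, momentum 0.9, and max_norm=1.0 are example values, not tuned for your setup), you could reduce the learning rate and optionally clip the gradient norm so a single bad batch cannot blow up the weights:

import torch
import torch.nn.functional as F

# example: a smaller learning rate than before
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

for inputs, targets in trainloader:
    if torch.cuda.is_available() and gpu:
        inputs, targets = inputs.cuda(), targets.cuda()

    optimizer.zero_grad()
    loss = F.cross_entropy(model(inputs), targets)
    loss.backward()
    # cap the gradient norm before the update (optional safeguard)
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()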
