NaN value for cost function from the first epoch

Hello,
I receive NaN values for the cost function starting from the first epoch. Could you please tell me what is going wrong? I define the network as below.

class MyNet(nn.Module):
    """Detection-style head on top of a pretrained feature extractor.

    All but the last two children of ``extractor`` are kept as the backbone
    (for a ResNet this drops avgpool + fc, leaving a 512-channel feature map),
    followed by alternating 3x3 / 1x1 conv + BatchNorm blocks and a final
    1x1 conv down to 30 output channels.

    Args:
        extractor: a pretrained ``nn.Module`` (e.g. torchvision resnet18)
            whose children minus the last two produce a 512-channel map.
    """

    def __init__(self, extractor):
        super(MyNet, self).__init__()
        # Keep everything except the last two children (classifier layers).
        self.features = nn.Sequential(
            *list(extractor.children())[:-2]
        )
        self.maxpool1 = nn.MaxPool2d(2, 2)
        self.conv1 = nn.Conv2d(512, 1024, 3, padding=1)
        self.batchNorm1 = nn.BatchNorm2d(1024)
        self.conv2 = nn.Conv2d(1024, 512, 1)
        self.batchNorm2 = nn.BatchNorm2d(512)
        self.conv3 = nn.Conv2d(512, 1024, 3, padding=1)
        self.batchNorm3 = nn.BatchNorm2d(1024)
        self.conv4 = nn.Conv2d(1024, 512, 1)
        self.batchNorm4 = nn.BatchNorm2d(512)
        self.conv5 = nn.Conv2d(512, 1024, 3, padding=1)
        self.batchNorm5 = nn.BatchNorm2d(1024)
        self.final = nn.Conv2d(1024, 30, 1)

    def forward(self, input):
        """Return a (N, 30, H/2s, W/2s) map, where s is the backbone stride."""
        output = self.features(input)
        output = self.maxpool1(output)
        output = f.leaky_relu(self.batchNorm1(self.conv1(output)), 0.1)
        output = f.leaky_relu(self.batchNorm2(self.conv2(output)), 0.1)
        output = f.leaky_relu(self.batchNorm3(self.conv3(output)), 0.1)
        output = f.leaky_relu(self.batchNorm4(self.conv4(output)), 0.1)
        output = f.leaky_relu(self.batchNorm5(self.conv5(output)), 0.1)
        # BUG FIX: functional dropout defaults to training=True, which keeps
        # dropout active even in eval mode; tie it to the module's mode.
        # NOTE(review): this leaky_relu uses the default 0.01 slope, not the
        # 0.1 used above — confirm whether that inconsistency is intended.
        output = f.leaky_relu(f.dropout(output, p=0.5, training=self.training))
        output = self.final(output)
        return output

and here is the initialization:

# Build the network from an ImageNet-pretrained ResNet-18 backbone.
resnet18 = torchvision.models.resnet18(pretrained=True)
net = MyNet(resnet18)

# Freeze the backbone so only the new head layers are trained.
for param in net.features.parameters():
    param.requires_grad = False

# Re-initialize the weights (parameters()[0]) of each new conv layer with
# small Gaussian noise; biases are left at their default initialization.
# NOTE(review): std=0.0002 is extremely small — combined with BatchNorm this
# can make early activations numerically fragile; verify it is intentional.
conv1Params = list(net.conv1.parameters())
conv2Params = list(net.conv2.parameters())
conv3Params = list(net.conv3.parameters())
conv4Params = list(net.conv4.parameters())
conv5Params = list(net.conv5.parameters())
convFinalParams = list(net.final.parameters())
conv1Params[0].data.normal_(0.0, 0.0002)
conv2Params[0].data.normal_(0.0, 0.0002)
conv3Params[0].data.normal_(0.0, 0.0002)
conv4Params[0].data.normal_(0.0, 0.0002)
conv5Params[0].data.normal_(0.0, 0.0002)
convFinalParams[0].data.normal_(0.0, 0.0002)

And here is the Adam optimizer initialization:

# Dummy input; V is the (legacy) torch.autograd.Variable wrapper.
input = V(torch.randn(1, nc, imageSize[0], imageSize[1]))

# BUG FIX: list(net.parameters())[-12:] only grabs the last 12 parameter
# tensors (batchNorm3 onward through `final`), silently leaving conv1-conv3,
# batchNorm1 and batchNorm2 out of the optimizer. Since the backbone was
# frozen with requires_grad=False, select every still-trainable parameter.
parameters = (p for p in net.parameters() if p.requires_grad)
learning_rate = 1e-4
optimizer = optim.Adam(params=parameters, lr=learning_rate)

Could you tell me where the problem is?

Edit:
I made the changes below to my forward function:
def forward(self, input):
    """Debug version of forward: prints the running mean after each stage
    to localize where NaNs first appear in the pipeline."""
    output = self.features(input)
    print("........... %f" % (output.data.mean()))
    output = self.maxpool1(output)
    print("........... %f" % (output.data.mean()))
    output = f.leaky_relu(self.batchNorm1(self.conv1(output)), 0.1)
    print("........... %f" % (output.data.mean()))
    output = f.leaky_relu(self.batchNorm2(self.conv2(output)), 0.1)
    output = f.leaky_relu(self.batchNorm3(self.conv3(output)), 0.1)
    print("........... %f" % (output.data.mean()))
    output = f.leaky_relu(self.batchNorm4(self.conv4(output)), 0.1)
    print("........... %f" % (output.data.mean()))
    output = f.leaky_relu(self.batchNorm5(self.conv5(output)), 0.1)
    print("........... %f" % (output.data.mean()))
    # NOTE(review): functional dropout defaults to training=True, so this is
    # active even at eval time — consider p=0.5, training=self.training.
    output = f.dropout(output, p=0.5)
    print("........... %f" % (output.data.mean()))
    output = self.final(output)
    # output = f.sigmoid(output)
    return output

And here is the output (I should mention that I backpropagate one image at a time):

(1,1) -> Current Batch Loss:nan
… 0.893032
… 1.491872
… 0.180793
… nan
… nan
… nan
… nan
(1,2) -> Current Batch Loss:nan
… 0.903442
… 1.534281
… 0.182008
… nan
… nan
… nan
… nan
(1,3) -> Current Batch Loss:nan
… 0.896864
… 1.470025
… 0.180523
… nan
… nan
… nan
… nan
(1,4) -> Current Batch Loss:nan
… 0.911260
… 1.501375
… 0.181454
… nan
… nan
… nan
… nan
(1,5) -> Current Batch Loss:nan
… 0.897548
… 1.495423
… 0.181025
… nan
… nan
… nan
… nan
(1,6) -> Current Batch Loss:nan
… 0.907124
… 1.515306
… 0.180970
… nan
… nan
… nan
… nan
(1,7) -> Current Batch Loss:nan
… 0.894349
… 1.472500
… 0.180993
… nan
… nan
… nan
… nan
(1,8) -> Current Batch Loss:nan
… 0.907916
… 1.535602
… 0.180869
… nan
… nan
… nan
… nan
(1,9) -> Current Batch Loss:nan
… 0.889712
… 1.469340
… 0.180603
… nan
… nan
… nan
… nan
(1,10) -> Current Batch Loss:nan
… 0.912330
… 1.530017
… 0.181718
… nan
… nan
… nan
… nan
(1,11) -> Current Batch Loss:nan
… 0.916205
… 1.547421
… 0.181335
… nan
… nan
… nan
… nan
(1,12) -> Current Batch Loss:nan
… 0.914901
… 1.538954
… 0.181181
… nan
… nan
… nan
… nan
(1,13) -> Current Batch Loss:nan
… 0.910332
… 1.508362
… 0.180705
… nan
… nan
… nan
… nan
(1,14) -> Current Batch Loss:nan
… 0.921174
… 1.557664
… 0.181560
… nan
… nan
… nan
… nan
(1,15) -> Current Batch Loss:nan
… 0.905606
… 1.528833
… 0.181028
… nan
… nan
… nan
… nan
(1,16) -> Current Batch Loss:nan
… 0.880896
… 1.449598
… 0.180272
… nan
… nan
… nan
… nan
(1,17) -> Current Batch Loss:nan
… 0.897655
… 1.520722
… 0.180509
… nan
… nan
… nan
… nan
(1,18) -> Current Batch Loss:nan
… 0.897704
… 1.495461
… 0.180581
… nan
… nan
… nan
… nan
(1,19) -> Current Batch Loss:nan
… 0.921070
… 1.548392
… 0.180941
… nan
… nan
… nan
… nan

@Atcold, Do you have any idea?

I think you should print output.abs().max() rather than output.mean().
Also, I think 0.89 is a really large number for an average.
Maybe try adding a BatchNorm layer after features.

Thanks, I will try it and let you know if anything changes.