Hello,
I receive NaN values for the cost function from the first epoch. Could you please tell me what is going wrong? I define the network as below.
class MyNet(nn.Module):
    """Convolutional head stacked on a pretrained feature extractor.

    The last two children of ``extractor`` are dropped (for ResNet-18 that
    is the avgpool and fc layers — TODO confirm for other backbones) and the
    rest is kept as ``self.features``, which is expected to emit 512-channel
    feature maps. Five conv+batchnorm stages and a final 1x1 conv map those
    features to 30 output channels.
    """

    def __init__(self, extractor):
        super(MyNet, self).__init__()
        # Backbone: every child of the extractor except its classifier head.
        self.features = nn.Sequential(
            # Select Feature
            *list(extractor.children())[:-2]
        )
        self.maxpool1 = nn.MaxPool2d(2, 2)
        self.conv1 = nn.Conv2d(512, 1024, 3, padding=1)
        self.batchNorm1 = nn.BatchNorm2d(1024)
        self.conv2 = nn.Conv2d(1024, 512, 1)
        self.batchNorm2 = nn.BatchNorm2d(512)
        self.conv3 = nn.Conv2d(512, 1024, 3, padding=1)
        self.batchNorm3 = nn.BatchNorm2d(1024)
        self.conv4 = nn.Conv2d(1024, 512, 1)
        self.batchNorm4 = nn.BatchNorm2d(512)
        self.conv5 = nn.Conv2d(512, 1024, 3, padding=1)
        self.batchNorm5 = nn.BatchNorm2d(1024)
        self.final = nn.Conv2d(1024, 30, 1)

    def forward(self, input):
        """Run the backbone, the five conv/BN/leaky-ReLU stages, dropout,
        and the final 1x1 conv. Returns a (N, 30, H/2s, W/2s) tensor where
        s is the backbone's total stride.
        """
        output = self.features(input)
        output = self.maxpool1(output)
        output = f.leaky_relu(self.batchNorm1(self.conv1(output)), 0.1)
        output = f.leaky_relu(self.batchNorm2(self.conv2(output)), 0.1)
        output = f.leaky_relu(self.batchNorm3(self.conv3(output)), 0.1)
        output = f.leaky_relu(self.batchNorm4(self.conv4(output)), 0.1)
        output = f.leaky_relu(self.batchNorm5(self.conv5(output)), 0.1)
        # BUG FIX: functional dropout defaults to training=True, so it stayed
        # active even after net.eval(); tie it to the module's mode instead.
        # Also dropped the extra leaky_relu that wrapped the dropout — the
        # activations were already leaky_relu'd above, and a second pass with
        # the default 0.01 slope re-scales negative values unintentionally.
        output = f.dropout(output, p=0.5, training=self.training)
        output = self.final(output)
        return output
and here is the initialization:
# Build the network on top of a pretrained ResNet-18 backbone.
resnet18 = torchvision.models.resnet18(pretrained=True)
net = MyNet(resnet18)

# Freeze the pretrained feature extractor; only the new head trains.
for param in net.features.parameters():
    param.requires_grad = False

# Parameter lists for the newly added layers (index 0 is the weight tensor).
conv1Params = list(net.conv1.parameters())
conv2Params = list(net.conv2.parameters())
conv3Params = list(net.conv3.parameters())
conv4Params = list(net.conv4.parameters())
conv5Params = list(net.conv5.parameters())
convFinalParams = list(net.final.parameters())

# Re-draw every new conv weight from N(0, 0.0002); biases keep their
# default initialization, exactly as before.
for layer_params in (conv1Params, conv2Params, conv3Params,
                     conv4Params, conv5Params, convFinalParams):
    layer_params[0].data.normal_(0.0, 0.0002)
Here is the Adam optimizer initialization:
# Dummy input of the expected shape (nc channels, imageSize spatial dims).
# NOTE(review): the name `input` shadows the builtin; kept for compatibility.
input = V(torch.randn(1, nc, imageSize[0], imageSize[1]))

# BUG FIX: the original used list(net.parameters())[-12:], intending the
# 12 weight/bias tensors of the 6 new conv layers. But parameters() also
# interleaves the 10 BatchNorm weight/bias tensors after each conv, so the
# slice actually grabbed bn3.weight ... final.bias and silently left conv1,
# conv2, conv3, bn1 and bn2 without an optimizer — their weights (drawn from
# N(0, 0.0002)) never trained. Select trainable parameters explicitly: the
# backbone is frozen (requires_grad=False), so this picks exactly the head.
parameters = [p for p in net.parameters() if p.requires_grad]
learning_rate = 1e-4
optimizer = optim.Adam(params=parameters, lr=learning_rate)
Could you tell me where the problem is?
Edit:
I made the changes below to my forward function:
def forward(self, input):
    """Forward pass instrumented with per-stage activation means.

    Identical to the normal forward, but prints the mean of each stage's
    output so the first stage producing NaNs can be localised.
    """

    def report(t):
        # Same debug line as before: mean of the current activations.
        print("........... %f" % (t.data.mean()))

    output = self.features(input)
    report(output)
    output = self.maxpool1(output)
    report(output)
    output = f.leaky_relu(self.batchNorm1(self.conv1(output)), 0.1)
    report(output)
    output = f.leaky_relu(self.batchNorm2(self.conv2(output)), 0.1)
    output = f.leaky_relu(self.batchNorm3(self.conv3(output)), 0.1)
    report(output)
    output = f.leaky_relu(self.batchNorm4(self.conv4(output)), 0.1)
    report(output)
    output = f.leaky_relu(self.batchNorm5(self.conv5(output)), 0.1)
    report(output)
    output = f.dropout(output, p=0.5)
    report(output)
    output = self.final(output)
    # output = f.sigmoid(output)
    return output
And here are the outputs (I should mention that I do backprop on one image at a time):
(1,1) → Current Batch Loss:nan
… 0.893032
… 1.491872
… 0.180793
… nan
… nan
… nan
… nan
(1,2) → Current Batch Loss:nan
… 0.903442
… 1.534281
… 0.182008
… nan
… nan
… nan
… nan
(1,3) → Current Batch Loss:nan
… 0.896864
… 1.470025
… 0.180523
… nan
… nan
… nan
… nan
(1,4) → Current Batch Loss:nan
… 0.911260
… 1.501375
… 0.181454
… nan
… nan
… nan
… nan
(1,5) → Current Batch Loss:nan
… 0.897548
… 1.495423
… 0.181025
… nan
… nan
… nan
… nan
(1,6) → Current Batch Loss:nan
… 0.907124
… 1.515306
… 0.180970
… nan
… nan
… nan
… nan
(1,7) → Current Batch Loss:nan
… 0.894349
… 1.472500
… 0.180993
… nan
… nan
… nan
… nan
(1,8) → Current Batch Loss:nan
… 0.907916
… 1.535602
… 0.180869
… nan
… nan
… nan
… nan
(1,9) → Current Batch Loss:nan
… 0.889712
… 1.469340
… 0.180603
… nan
… nan
… nan
… nan
(1,10) → Current Batch Loss:nan
… 0.912330
… 1.530017
… 0.181718
… nan
… nan
… nan
… nan
(1,11) → Current Batch Loss:nan
… 0.916205
… 1.547421
… 0.181335
… nan
… nan
… nan
… nan
(1,12) → Current Batch Loss:nan
… 0.914901
… 1.538954
… 0.181181
… nan
… nan
… nan
… nan
(1,13) → Current Batch Loss:nan
… 0.910332
… 1.508362
… 0.180705
… nan
… nan
… nan
… nan
(1,14) → Current Batch Loss:nan
… 0.921174
… 1.557664
… 0.181560
… nan
… nan
… nan
… nan
(1,15) → Current Batch Loss:nan
… 0.905606
… 1.528833
… 0.181028
… nan
… nan
… nan
… nan
(1,16) → Current Batch Loss:nan
… 0.880896
… 1.449598
… 0.180272
… nan
… nan
… nan
… nan
(1,17) → Current Batch Loss:nan
… 0.897655
… 1.520722
… 0.180509
… nan
… nan
… nan
… nan
(1,18) → Current Batch Loss:nan
… 0.897704
… 1.495461
… 0.180581
… nan
… nan
… nan
… nan
(1,19) → Current Batch Loss:nan
… 0.921070
… 1.548392
… 0.180941
… nan
… nan
… nan
… nan