NaN values obtained in loss.data[0]

import torch.nn as nn
import torch.nn.functional as nnFunctions

class convNet(nn.Module):
    # constructor
    def __init__(self):
        super(convNet, self).__init__()
        # defining layers in convnet
        # input size = 1*657*1625
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)

        # parallel rectangular and square convolutions
        self.Pconv1 = nn.Conv2d(64, 32, kernel_size=(3, 3), stride=1, padding=(1, 1))
        self.Pconv2 = nn.Conv2d(64, 32, kernel_size=(3, 7), stride=1, padding=(1, 3))
        self.Pconv3 = nn.Conv2d(64, 32, kernel_size=(7, 3), stride=1, padding=(3, 1))

        # auxiliary convolutions
        self.conv6 = nn.Conv2d(32, 8, kernel_size=3, stride=1, padding=1)
        self.conv7 = nn.Conv2d(8, 1, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        x = nnFunctions.leaky_relu(self.conv1(x))
        x = nnFunctions.leaky_relu(self.conv2(x))
        x = nnFunctions.leaky_relu(self.conv3(x))
        # sum of the three parallel branches
        x = nnFunctions.leaky_relu(self.Pconv1(x)) + nnFunctions.leaky_relu(self.Pconv2(x)) + nnFunctions.leaky_relu(self.Pconv3(x))
        x = nnFunctions.leaky_relu(self.conv6(x))
        x = nnFunctions.leaky_relu(self.conv7(x))
        return x

The above is my convNet class. It takes input of shape 410x1x512x1024 and outputs data of the same shape: the data is 410 grayscale images (so 1 channel) of size 512x1024.
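As a quick sanity check that the network really preserves the input shape, here is a minimal sketch (assuming the convNet class above; the batch size of 2 is arbitrary, just to keep memory low):

import torch
from torch.autograd import Variable

net = convNet()
dummy = Variable(torch.randn(2, 1, 512, 1024))  # small grayscale batch of the stated spatial size
out = net(dummy)
print(out.size())  # every conv uses stride 1 with matching padding, so this stays (2, 1, 512, 1024)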

I use the loss function:

criterion = nn.SmoothL1Loss()
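As a side note, a minimal sketch (assuming the Variable-era API used in this thread) showing that SmoothL1Loss on finite inputs of matching shape yields a finite scalar, so a nan loss means the outputs or labels already contain nan or inf:

import torch
import torch.nn as nn
from torch.autograd import Variable

criterion = nn.SmoothL1Loss()
out = Variable(torch.randn(2, 1, 4, 4))
target = Variable(torch.randn(2, 1, 4, 4))
loss = criterion(out, target)
print(loss.data)  # a finite scalar for finite inputs; nan would mean bad data or diverged weights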
The following is my train function:

import torch.optim as optim
from torch.autograd import Variable

def train(train_loader, net, criterion, epochs, total_samples, learning_rate):
    prev_loss = 0
    optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)

    for epoch in range(int(epochs)):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(train_loader):
            inputs, labels = data
            # wrap them in Variable and move to the GPU
            inputs, labels = Variable(inputs).cuda(), Variable(labels).cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.data[0]  # loss.item() on newer PyTorch versions
            print(i, running_loss)
    print('Finished Training')
    return net

The `print(i, running_loss)` in the train function prints sensible values for 3 batches and then just outputs nan for every batch afterwards.

Kindly help

It's hard to tell what's wrong just from this. Maybe a bad init, maybe a too-large learning rate, or maybe something else. I would advise you to print every layer's output to gather more information. It's also a good idea to add `pdb.set_trace()` in `forward`:

def forward(self, x):
    x = nnFunctions.leaky_relu(self.conv1(x))
    print(x.data)
    x = nnFunctions.leaky_relu(self.conv2(x))
    print(x.data)
    x = nnFunctions.leaky_relu(self.conv3(x))
    print(x.data)
    x = nnFunctions.leaky_relu(self.Pconv1(x)) + nnFunctions.leaky_relu(self.Pconv2(x)) + nnFunctions.leaky_relu(self.Pconv3(x))
    print(x.data)
    x = nnFunctions.leaky_relu(self.conv6(x))
    x = nnFunctions.leaky_relu(self.conv7(x))
    return x
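Alternatively, instead of editing forward, you could register forward hooks that flag any layer whose output contains a NaN (a rough sketch; the NaN-never-equals-itself trick avoids needing torch.isnan on older versions):

def make_nan_hook(name):
    def hook(module, input, output):
        # a NaN never equals itself, so this detects NaNs without torch.isnan
        if (output.data != output.data).any():
            print('NaN in the output of', name)
    return hook

for name, module in net.named_modules():
    if name:  # skip the root module itself
        module.register_forward_hook(make_nan_hook(name))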

Even if my learning rate is high, shouldn't the loss just increase rather than become nan? I would expect the output to be a number larger than the previous loss, not nan.
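A minimal demonstration of why divergence ends in nan rather than just an ever-larger number (assuming nothing beyond float32 overflow behaviour):

import torch

t = torch.Tensor([1e38]) * 10  # exceeds float32 max (~3.4e38), overflows to inf
print(t)       # inf
print(t - t)   # inf - inf = nan
print(t * 0)   # inf * 0 = nan

Once a weight overflows to inf, the next forward pass mixes inf values and the loss comes out nan.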

Is your loss printing out NaN from the beginning, or are you getting numbers that constantly increase and eventually become NaN?

The numbers decrease for 3 batches of data, and then I get nan values for every subsequent batch and iteration.

Could you check whether the numbers rapidly increase with each iteration? I had a similar problem, but it turned out to be a dumb mistake: I wasn't resetting the printed loss after every n iterations, so it just kept accumulating.
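For reference, the usual pattern looks like this (a sketch; the interval of 100 batches is an arbitrary choice):

running_loss += loss.data[0]
if i % 100 == 99:
    print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 100))
    running_loss = 0.0  # reset so the printed value covers only the last 100 batches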

Did you manage to find a solution to this? (If you still remember :slight_smile: )