Unable to get non-zero predictions

I have image data of size 410x1x657x1625, where 410 is the number of images. I have masks of the same dimensions, where a pixel value is 255 if it is part of text and 0 otherwise.
Now I train my network with the SmoothL1Loss loss function without sizeAverage, summing up the loss and then dividing by the total number of pixels, i.e. 410x1x657x1625; the per-pixel loss turns out to be approximately 35.
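In code, the per-pixel loss described above amounts to something like this (a minimal sketch with small dummy tensors; the real data is 410x1x657x1625):

import torch
import torch.nn as nn
from torch.autograd import Variable

# dummy prediction and 0/255 mask, much smaller than the real data
predicted = Variable(torch.rand(2, 1, 657, 1625))
target = Variable((torch.rand(2, 1, 657, 1625) > 0.9).float() * 255)

criterion = nn.SmoothL1Loss(size_average=False)  # summed, not averaged
per_pixel_loss = criterion(predicted, target).data[0] / target.data.numel()
print(per_pixel_loss)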
But when I plot the predicted values (the predicted mask) for the training data, I get 0 for every pixel.
I can’t understand the problem.

Hi, it’s best to post your code as well - it makes it easier to spot any problems.

Here’s my network:

# imports used throughout this post
import torch
import torch.nn as nn
import torch.nn.functional as nnFunctions
import torch.optim as optim
from torch.autograd import Variable

class convNet(nn.Module):
    # constructor
    def __init__(self):
        super(convNet, self).__init__()
        # defining layers in convnet
        # input size = 1 x 657 x 1625
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 64, kernel_size=3, stride=1, padding=1)
        #self.bn1 = nn.BatchNorm2d(32)

        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.pconv1 = nn.Conv2d(128, 128, kernel_size=(3, 3), stride=1, padding=(1, 1))
        #self.bn2 = nn.BatchNorm2d(64)
        self.pconv2 = nn.Conv2d(128, 128, kernel_size=(3, 7), stride=1, padding=(1, 3))
        self.pconv3 = nn.Conv2d(128, 128, kernel_size=(7, 3), stride=1, padding=(3, 1))

        self.conv4 = nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        x = nnFunctions.relu(self.conv1(x))
        x = nnFunctions.relu(self.conv2(x))
        x = nnFunctions.relu(self.conv3(x))
        # parallel convolutions with different kernel shapes, summed
        x = nnFunctions.relu(self.pconv1(x) + self.pconv2(x) + self.pconv3(x))

        x = nnFunctions.relu(self.conv4(x))
        x = nnFunctions.relu(self.conv5(x))
        return x

Initialization:

net = convNet()
net.cuda()
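A quick way to sanity-check the forward pass is to run a small dummy input through the net (the 64x64 size here is illustrative only, not the real 657x1625 images):

import torch
from torch.autograd import Variable

dummy = Variable(torch.randn(1, 1, 64, 64)).cuda()  # hypothetical small input
out = net(dummy)
print(out.size())                       # expected: (1, 1, 64, 64)
print(out.data.min(), out.data.max())   # final layer is ReLU, so min should be >= 0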

Loss function:

def L1Loss(predicted, target):
    # summed absolute error over all pixels (no averaging)
    loss = (predicted - target).abs().sum()
    return loss
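For reference, this summed L1 loss should match the built-in criterion; a small check with illustrative dummy tensors:

import torch
import torch.nn as nn
from torch.autograd import Variable

a = Variable(torch.rand(2, 1, 4, 4))
b = Variable(torch.rand(2, 1, 4, 4))
print(L1Loss(a, b).data[0])
print(nn.L1Loss(size_average=False)(a, b).data[0])  # should give the same value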

Learning rate:

learning_rate = 1e-10

Train function:

def train(train_loader, net, epochs, total_samples):
    global learning_rate
    prev_loss = 0

    for epoch in range(int(epochs)):  # loop over the dataset multiple times
        optimizer = optim.Adagrad(net.parameters(), lr=learning_rate, lr_decay=0.25, weight_decay=1e-4)
        running_loss = 0.0
        for i, data in enumerate(train_loader):
            inputs, labels = data
            # wrap them in Variable and move to GPU
            inputs, labels = Variable(inputs).cuda(), Variable(labels).cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = L1Loss(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.data[0]
            cur_loss = loss.data[0]
            print('Batch ' + str(i) + ':' + str(cur_loss))
        running_loss = running_loss / 26790000.0
        print('\t Iteration ' + str(epoch) + ':' + str(running_loss))
#         if(prev_loss<running_loss):
#             learning_rate/=10
        prev_loss = running_loss
    print('Finished Training')
    return net
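For completeness, a call of this form would kick off training (the epoch count is illustrative, and train_loader is assumed to yield (image, mask) batches):

net = train(train_loader, net, epochs=5, total_samples=410)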

Testing:

images, labels = dataiter.next()
net.cuda()
predicted = net(Variable(images).cuda())

dataiter is an iterator over the train loader.

Printing predicted.cpu() gives 0 for all values.
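To check whether the outputs are exactly zero rather than just very small, one can look at the raw statistics of the prediction (a quick diagnostic sketch using the predicted Variable from above):

print(predicted.data.min(), predicted.data.max(), predicted.data.mean())
print((predicted.data != 0).sum())  # number of non-zero pixels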