RuntimeError: value cannot be converted to type float without overflow: (3.52033e-08,-1.14383e-08)

/home/v2m/anaconda3/envs/my_env3/lib/python3.7/site-packages/torch/nn/_reduction.py:46: UserWarning: size_average and reduce args will be deprecated, please use reduction='sum' instead.
  warnings.warn(warning.format(ret))
/home/v2m/anaconda3/envs/my_env3/lib/python3.7/site-packages/torch/nn/functional.py:2622: UserWarning: nn.functional.upsample_bilinear is deprecated. Use nn.functional.interpolate instead.
  warnings.warn("nn.functional.upsample_bilinear is deprecated. Use nn.functional.interpolate instead.")
0.4137219845991996
0.4524866500379986
0.5172407126352913
0.5623468630359362
Traceback (most recent call last):
  File "pytorch_segmentation_detection/recipes/pascal_voc/segmentation/psp_resnet_50_8s_synch_bn_train.py", line 405, in <module>
    optimizer.step()
  File "/home/v2m/anaconda3/envs/my_env3/lib/python3.7/site-packages/torch/optim/adam.py", line 107, in step
    p.data.addcdiv_(-step_size, exp_avg, denom)
RuntimeError: value cannot be converted to type float without overflow: (3.52033e-08,-1.14383e-08)

The value in the error message is printed as a pair of numbers, which looks like a complex value rather than a plain float. My full script is below:

#!/usr/bin/env python
# coding: utf-8

# In[2]:


#get_ipython().run_line_magic('matplotlib', 'notebook')

import sys, os
sys.path.append("/home/v2m/projects/pytorch-segmentation-detection/")
sys.path.append("/home/v2m/projects/pytorch-segmentation-detection/synchronized_batchnorm/")
sys.path.insert(0, '/home/v2m/projects/pytorch-segmentation-detection/vision/')


import torch.nn as nn
import torchvision.models as models
import torch

from pytorch_segmentation_detection.datasets.pascal_voc import PascalVOCSegmentation

from pytorch_segmentation_detection.transforms import (ComposeJoint,
                                                       RandomHorizontalFlipJoint,
                                                       RandomScaleJoint,
                                                       CropOrPad,
                                                       ResizeAspectRatioPreserve)

import torchvision
import torch.optim as optim
from torch.autograd import Variable
import torchvision.transforms as transforms

import numbers
import random

from matplotlib import pyplot as plt

import numpy as np
from PIL import Image

from sklearn.metrics import confusion_matrix

def flatten_logits(logits, number_of_classes):
    """Flattens the logits batch except for the logits dimension"""
    
    logits_permuted = logits.permute(0, 2, 3, 1)
    logits_permuted_cont = logits_permuted.contiguous()
    logits_flatten = logits_permuted_cont.view(-1, number_of_classes)
    
    return logits_flatten

def flatten_annotations(annotations):
    
    return annotations.view(-1)

def get_valid_annotations_index(flatten_annotations, mask_out_value=255):
    
    return torch.squeeze( torch.nonzero((flatten_annotations != mask_out_value )), 1)
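
# For intuition, a minimal sketch (toy tensors invented for illustration, not part
# of the dataset) of how the three helpers above work together: per-pixel logits
# and labels are flattened, then only pixels whose label differs from the
# mask-out value 255 are kept.
#
#   toy_logits = torch.randn(1, 21, 2, 2)                 # (N, C, H, W)
#   toy_labels = torch.tensor([[[0, 255], [5, 20]]])      # (N, H, W), one masked pixel
#   flat_logits = flatten_logits(toy_logits, number_of_classes=21)   # shape (4, 21)
#   flat_labels = flatten_annotations(toy_labels)                    # shape (4,)
#   keep = get_valid_annotations_index(flat_labels, mask_out_value=255)
#   # keep == tensor([0, 2, 3]); the 255-labelled pixel never reaches the loss.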


def adjust_learning_rate(optimizer, iteration):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    
    max_iteration = 13000.0
    
    multiplier = (1.0 - (iteration / max_iteration)) ** (0.9)
    
    lr = 0.0001 * multiplier
    
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
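
# For reference, a few values of the poly schedule above (hypothetical iterations,
# multiplier = (1 - iteration / 13000) ** 0.9, base LR 1e-4):
#
#   iteration     0  ->  lr = 1.0e-4
#   iteration  6500  ->  lr ~ 0.54e-4   (since (1 - 0.5) ** 0.9 ~ 0.536)
#   iteration 13000  ->  lr = 0.0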



from pytorch_segmentation_detection.transforms import RandomCropJoint


number_of_classes = 21

labels = range(number_of_classes)

train_transform = ComposeJoint(
                [
                    RandomHorizontalFlipJoint(),
                    RandomCropJoint(crop_size=(513, 513)),
                    #[ResizeAspectRatioPreserve(greater_side_size=384),
                    # ResizeAspectRatioPreserve(greater_side_size=384, interpolation=Image.NEAREST)],
                    
                    #RandomCropJoint(size=(274, 274))
                    # RandomScaleJoint(low=0.9, high=1.1),
                    
                    #[CropOrPad(output_size=(288, 288)), CropOrPad(output_size=(288, 288), fill=255)],
                    [transforms.ToTensor(), None],
                    [transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), None],
                    [None, transforms.Lambda(lambda x: torch.from_numpy(np.asarray(x)).long()) ]
                ])

trainset = PascalVOCSegmentation(download=False,
                                 joint_transform=train_transform,
                                 split_mode=1)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                          shuffle=True, num_workers=4, drop_last=True)


valid_transform = ComposeJoint(
                [
                     [transforms.ToTensor(), None],
                     [transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), None],
                     [None, transforms.Lambda(lambda x: torch.from_numpy(np.asarray(x)).long()) ]
                ])


valset = PascalVOCSegmentation(train=False,
                               download=False,
                               joint_transform=valid_transform,
                               split_mode=1)


valset_loader = torch.utils.data.DataLoader(valset, batch_size=1,
                                            shuffle=False, num_workers=2)

train_subset_sampler = torch.utils.data.sampler.SubsetRandomSampler(range(904))
train_subset_loader = torch.utils.data.DataLoader(dataset=trainset, batch_size=1,
                                                   sampler=train_subset_sampler,
                                                   num_workers=2)


# Validation function to track MIoU on the validation set during training
def validate():
    
    fcn.eval()
    
    overall_confusion_matrix = None

    for image, annotation in valset_loader:

        image = Variable(image.cuda())
        logits = fcn(image)

        # First we do argmax on gpu and then transfer it to cpu
        logits = logits.data
        _, prediction = logits.max(1)
        prediction = prediction.squeeze(1)

        prediction_np = prediction.cpu().numpy().flatten()
        annotation_np = annotation.numpy().flatten()

        # Pixels with the mask-out value (255) are dropped automatically,
        # because 255 is not listed in the `labels` passed to confusion_matrix

        current_confusion_matrix = confusion_matrix(y_true=annotation_np,
                                                    y_pred=prediction_np,
                                                    labels=labels)

        if overall_confusion_matrix is None:
            overall_confusion_matrix = current_confusion_matrix
        else:
            overall_confusion_matrix += current_confusion_matrix
    
    
    intersection = np.diag(overall_confusion_matrix)
    ground_truth_set = overall_confusion_matrix.sum(axis=1)
    predicted_set = overall_confusion_matrix.sum(axis=0)
    union =  ground_truth_set + predicted_set - intersection

    intersection_over_union = intersection / union.astype(np.float32)
    mean_intersection_over_union = np.mean(intersection_over_union)
    
    fcn.train()

    return mean_intersection_over_union
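
# A worked toy example (hypothetical 2-class confusion matrix, not real results)
# of the MIoU computation used above:
#
#   confusion = [[3, 1],    # rows: ground truth, columns: prediction
#                [2, 4]]
#   intersection = diag(confusion)             = [3, 4]
#   union        = row_sums + col_sums - diag  = [4+5-3, 6+5-4] = [6, 7]
#   IoU          = intersection / union        = [0.50, 0.57]
#   MIoU         = mean(IoU)                  ~= 0.54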


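# Same as validate() above, but run over the first 904 training images
# (train_subset_loader) to compare training and validation MIoU.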
def validate_train():
    
    fcn.eval()
    
    overall_confusion_matrix = None

    for image, annotation in train_subset_loader:

        image = Variable(image.cuda())
        logits = fcn(image)

        # First we do argmax on gpu and then transfer it to cpu
        logits = logits.data
        _, prediction = logits.max(1)
        prediction = prediction.squeeze(1)

        prediction_np = prediction.cpu().numpy().flatten()
        annotation_np = annotation.numpy().flatten()

        # Pixels with the mask-out value (255) are dropped automatically,
        # because 255 is not listed in the `labels` passed to confusion_matrix

        current_confusion_matrix = confusion_matrix(y_true=annotation_np,
                                                    y_pred=prediction_np,
                                                    labels=labels)

        if overall_confusion_matrix is None:
            overall_confusion_matrix = current_confusion_matrix
        else:
            overall_confusion_matrix += current_confusion_matrix
    
    
    intersection = np.diag(overall_confusion_matrix)
    ground_truth_set = overall_confusion_matrix.sum(axis=1)
    predicted_set = overall_confusion_matrix.sum(axis=0)
    union =  ground_truth_set + predicted_set - intersection

    intersection_over_union = intersection / union.astype(np.float32)
    mean_intersection_over_union = np.mean(intersection_over_union)
    
    fcn.train()

    return mean_intersection_over_union

class Resnet18_16s(nn.Module):
    
    
    def __init__(self, num_classes=1000):
        
        super(Resnet18_16s, self).__init__()
        
        # Load a pretrained ResNet-50, remove the avg pool
        # layer and use an output stride of 8 (despite the
        # Resnet18_16s naming, this is a ResNet-50)
        resnet18_16s = models.resnet50(fully_conv=True,
                                      pretrained=True,
                                      output_stride=8,
                                      remove_avg_pool_layer=True,
                                      additional_blocks=0)
        
        # Randomly initialize the 1x1 Conv scoring layer
        resnet18_16s.fc = nn.Conv2d(resnet18_16s.inplanes * 2, num_classes, 1)
        
        self.resnet18_16s = resnet18_16s
        
        self.reduction_pooled_1 = nn.Conv2d(resnet18_16s.inplanes, 512, 1)
        self.reduction_pooled_2 = nn.Conv2d(resnet18_16s.inplanes, 512, 1)
        self.reduction_pooled_3 = nn.Conv2d(resnet18_16s.inplanes, 512, 1)
        self.reduction_pooled_4 = nn.Conv2d(resnet18_16s.inplanes, 512, 1)
        
        self._normal_initialization(self.resnet18_16s.fc)
        
    def _normal_initialization(self, layer):
        
        layer.weight.data.normal_(0, 0.01)
        layer.bias.data.zero_()
        
    def forward(self, x):
        
        input_spatial_dim = x.size()[2:]
        
        x = self.resnet18_16s.conv1(x)
        x = self.resnet18_16s.bn1(x)
        x = self.resnet18_16s.relu(x)
        x = self.resnet18_16s.maxpool(x)

        x = self.resnet18_16s.layer1(x)
        x = self.resnet18_16s.layer2(x)
        x = self.resnet18_16s.layer3(x)        
        x = self.resnet18_16s.layer4(x)
        
        fcn_features_spatial_dim = x.size()[2:]
        
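        # Pyramid pooling (PSP): pool the final feature map to 1x1, 2x2, 3x3 and
        # 6x6 grids, reduce each to 512 channels with a 1x1 conv, upsample back to
        # the feature-map size and concatenate with the original features.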
        pooled_1 = nn.functional.adaptive_avg_pool2d(x, 1)
        pooled_1 = self.reduction_pooled_1(pooled_1)
        pooled_1 = nn.functional.upsample_bilinear(pooled_1, size=fcn_features_spatial_dim)
        
        pooled_2 = nn.functional.adaptive_avg_pool2d(x, 2)
        pooled_2 = self.reduction_pooled_2(pooled_2)
        pooled_2 = nn.functional.upsample_bilinear(pooled_2, size=fcn_features_spatial_dim)

        pooled_3 = nn.functional.adaptive_avg_pool2d(x, 3)
        pooled_3 = self.reduction_pooled_3(pooled_3)
        pooled_3 = nn.functional.upsample_bilinear(pooled_3, size=fcn_features_spatial_dim)

        pooled_4 = nn.functional.adaptive_avg_pool2d(x, 6)
        pooled_4 = self.reduction_pooled_4(pooled_4)
        pooled_4 = nn.functional.upsample_bilinear(pooled_4, size=fcn_features_spatial_dim)
        
        x = torch.cat([x, pooled_1, pooled_2, pooled_3, pooled_4],
                      dim=1)
        
        x = self.resnet18_16s.fc(x)
        
        x = nn.functional.upsample_bilinear(input=x, size=input_spatial_dim)
        
        return x
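
# A quick shape sanity check for the module above, kept as a comment (a sketch;
# it assumes the forked torchvision resnet50 with the fully_conv / output_stride
# keyword arguments is importable, as set up on sys.path earlier):
#
#   psp = Resnet18_16s(num_classes=21)
#   dummy_input = torch.randn(1, 3, 513, 513)
#   logits = psp(dummy_input)
#   # logits.size() == (1, 21, 513, 513): the concatenated features
#   # (2048 + 4 * 512 = 4096 channels) are scored by the 1x1 conv and
#   # upsampled back to the input resolution.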
    


# In[2]:


#get_ipython().run_line_magic('matplotlib', 'notebook')

from matplotlib import pyplot as plt


# Create the training plot
loss_current_iteration = 0
loss_history = []
loss_iteration_number_history = []

validation_current_iteration = 0
validation_history = []
validation_iteration_number_history = []

train_validation_current_iteration = 0
train_validation_history = []
train_validation_iteration_number_history = []
 
f, (loss_axis, validation_axis) = plt.subplots(2, 1)

loss_axis.plot(loss_iteration_number_history, loss_history)
validation_axis.plot(validation_iteration_number_history, validation_history, 'b',
                     train_validation_iteration_number_history, train_validation_history, 'r')

loss_axis.set_title('Training loss')
validation_axis.set_title('MIoU on validation dataset')

plt.tight_layout()


# In[3]:


from sync_batchnorm import SynchronizedBatchNorm2d, DataParallelWithCallback

def make_batchnorm_synchronized(module):
    
    for child_module_name, child_module in module.named_children():
        
        if isinstance(child_module, nn.BatchNorm2d):
            
            sync_bn = SynchronizedBatchNorm2d(child_module.num_features)
            sync_bn.weight = child_module.weight
            sync_bn.bias = child_module.bias
            sync_bn.running_var = child_module.running_var
            sync_bn.running_mean = child_module.running_mean
            module.__setattr__(child_module_name, sync_bn)

fcn = Resnet18_16s(num_classes=21)
fcn.apply(make_batchnorm_synchronized)
fcn = DataParallelWithCallback(fcn, device_ids=[0, 1])

fcn.cuda()
fcn.train()


criterion = nn.CrossEntropyLoss(size_average=False).cuda()

optimizer = optim.Adam(fcn.parameters(), lr=0.0001)


# In[ ]:


best_validation_score = 0
loss_current_iteration = 0

iter_size = 20

for epoch in range(1000):  # loop over the dataset multiple times

    running_loss = 0.0
    
    for i, data in enumerate(trainloader, 0):
        
        # get the inputs
        img, anno = data
        
        # We need to flatten annotations and logits to apply index of valid
        # annotations. All of this is because pytorch doesn't have tf.gather_nd()
        anno_flatten = flatten_annotations(anno)
        index = get_valid_annotations_index(anno_flatten, mask_out_value=255)
        anno_flatten_valid = torch.index_select(anno_flatten, 0, index)

        # wrap them in Variable
        # the index can be acquired on the gpu
        img, anno_flatten_valid, index = Variable(img.cuda()), Variable(anno_flatten_valid.cuda()), Variable(index.cuda())

        # zero the parameter gradients
        optimizer.zero_grad()
        
        adjust_learning_rate(optimizer, loss_current_iteration)


        # forward + backward + optimize
        logits = fcn(img)
        logits_flatten = flatten_logits(logits, number_of_classes=21)
        logits_flatten_valid = torch.index_select(logits_flatten, 0, index)
        
        loss = criterion(logits_flatten_valid, anno_flatten_valid)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += (loss.data.item() / logits_flatten_valid.size(0)) 
        if i % 2 == 1:
            
            
            loss_history.append(running_loss / 2)
            loss_iteration_number_history.append(loss_current_iteration)
            
            loss_current_iteration += 1
            
            loss_axis.lines[0].set_xdata(loss_iteration_number_history)
            loss_axis.lines[0].set_ydata(loss_history)

            loss_axis.relim()
            loss_axis.autoscale_view()
            loss_axis.figure.canvas.draw()
            
            
            running_loss = 0.0
        
            
            
    current_validation_score = validate()
    validation_history.append(current_validation_score)
    validation_iteration_number_history.append(validation_current_iteration)

    validation_current_iteration += 1

    validation_axis.lines[0].set_xdata(validation_iteration_number_history)
    validation_axis.lines[0].set_ydata(validation_history)



    current_train_validation_score = validate_train()
    train_validation_history.append(current_train_validation_score)
    train_validation_iteration_number_history.append(train_validation_current_iteration)

    train_validation_current_iteration += 1

    validation_axis.lines[1].set_xdata(train_validation_iteration_number_history)
    validation_axis.lines[1].set_ydata(train_validation_history)


    validation_axis.relim()
    validation_axis.autoscale_view()
    validation_axis.figure.canvas.draw()

    # Save the model if it has a better MIoU score.
    if current_validation_score > best_validation_score:

        torch.save(fcn.state_dict(), 'resnet_50_psp_check.pth')
        best_validation_score = current_validation_score
        print(best_validation_score)
        
                

print('Finished Training')

How can I solve this issue?