'ConvolutionBackward0' Returned Nan Values in its 0th Output

NavinKumarMNK · March 22, 2023, 1:39pm

Logs :

I am getting this error every time, at a random training step somewhere in the range 300–3000.

Pytorch-lightning Parameters:
# PyTorch Lightning parameters for the autoencoder training run.
[AUTOENCODER_TRAIN]
max_epochs = 500
min_epochs = 250
#accelerator = gpu
benchmark = True
weights_summary = full
# 16-bit precision training.
precision = 16
gradient_clip_val = 5
auto_lr_find = True
auto_scale_batch_size = True
auto_select_gpus = True
check_val_every_n_epoch = 1
fast_dev_run = False
enable_progress_bar = True
# NOTE(review): combined with precision = 16 above, anomaly detection is the
# setting the replies in this thread identify as producing false NaN errors;
# it is described there as a debugging-only mode.
detect_anomaly=True
accumulate_grad_batches=8
track_grad_norm=2
sync_batchnorm = True
# NOTE(review): presumably fractions of the available batches (1%) - confirm
# against the Lightning Trainer documentation for the version in use.
limit_val_batches=0.01
limit_train_batches=0.01

DataLoader Code

'@Author:NavinKumarMNK'
# Add the parent directory to the path
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))

import utils.utils as utils

# Import the required modules
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader, random_split
import cv2
import PIL
import numpy as np
from utils.preprocessing import ImagePreProcessing

class AutoEncoderDataset(Dataset):
    """Map-style dataset that returns randomly sampled frames of one video.

    Every line of the annotation file names one video, relative to
    ``data_path``.  An item is a pair of tensors stacked along dim 0, built
    from up to ``batch_size`` randomly chosen frames of that video.
    """

    def __init__(self, batch_size:int, num_workers:int,
                    data_path, annotation_train) -> None:
        """Read the annotation file and store the sampling configuration.

        Args:
            batch_size: Maximum number of frames sampled from one video.
            num_workers: Stored on the instance; not used by the dataset.
            data_path: Directory the annotation entries are joined onto.
            annotation_train: Path to a text file with one video path per line.
        """
        super().__init__()
        self.data_path = data_path
        # Context manager so the file handle is closed deterministically.
        with open(annotation_train, 'r') as annotation_file:
            self.annotation_train = annotation_file.read().splitlines()
        self.batch_size = int(batch_size)
        self.num_workers = int(num_workers)
        self.preprocessing = ImagePreProcessing()

        self.index = 0

    def __len__(self):
        """Return the number of annotated videos."""
        return len(self.annotation_train)

    def _read_frames(self, cap, count):
        """Decode and preprocess randomly chosen frames from an open capture.

        Args:
            cap: An opened ``cv2.VideoCapture``.
            count: Number of frames the capture reports (must be > 0).

        Returns:
            Two lists of tensors: the ``improve`` outputs and the ``noise``
            outputs, one entry per frame that could be decoded.
        """
        # Sample with replacement; short videos yield ``count`` frames only.
        frame_ids = np.random.randint(0, count, min(count, self.batch_size))

        frames = []
        original = []
        for frame_id in frame_ids:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_id))
            ret, image = cap.read()
            if not ret:
                continue
            # OpenCV returns HWC; move channels first before the transforms.
            image = np.transpose(image, (2, 0, 1))
            image = self.preprocessing.transforms(torch.from_numpy(image))
            image = self.preprocessing.preprocess(image)
            image = self.preprocessing.augumentation(image)
            # NOTE(review): the model input comes from ``improve`` and the
            # target from ``noise``; confirm this is not swapped.
            frames.append(self.preprocessing.improve(image))
            original.append(self.preprocessing.noise(image))
        return frames, original

    def __getitem__(self, index:int):
        """Return ``(X, y)`` for the video at ``index``.

        If that video cannot be opened, yields no decodable frame, or produces
        NaN values, the following entries are tried in order (wrapping around
        the end of the list), each at most once.

        Raises:
            IndexError: If the dataset is empty.
            RuntimeError: If no annotated video yields a usable sample.
        """
        total = len(self.annotation_train)
        if total == 0:
            raise IndexError("AutoEncoderDataset is empty")

        # Start at the requested index itself and visit every entry at most
        # once, so the search always terminates.
        for attempt in range(total):
            entry = self.annotation_train[(index + attempt) % total]
            video_path = os.path.join(self.data_path, entry)

            cap = cv2.VideoCapture(video_path.strip())
            try:
                count: int = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

                # Debug trace kept from the original implementation.
                print(count, attempt + 1)
                if not cap.isOpened() or count <= 0:
                    continue

                frames, original = self._read_frames(cap, count)
            finally:
                # Always free the decoder, including on the ``continue`` path.
                cap.release()

            if not frames:
                # torch.stack would raise on an empty list; try the next video.
                continue

            X = torch.stack(frames, dim=0)
            y = torch.stack(original, dim=0)

            if torch.isnan(X).any() or torch.isnan(y).any():
                print("reported")
                continue

            return X, y

        raise RuntimeError(
            "No readable video with valid frames found in the annotation list")
    
class AutoEncoderDataModule(pl.LightningDataModule):
    """LightningDataModule that splits the annotated videos 80/10/10.

    ``setup`` builds one ``AutoEncoderDataset`` and randomly partitions it
    into train, validation and test subsets; each ``*_dataloader`` method
    serves its own subset.
    """

    def __init__(self, batch_size:int, num_workers:int,
                    data_path, annotation_train) -> None:
        """Store the configuration; no data is touched until ``setup``.

        Args:
            batch_size: Forwarded to ``AutoEncoderDataset``.
            num_workers: Worker processes used by every DataLoader.
            data_path: Forwarded to ``AutoEncoderDataset``.
            annotation_train: Forwarded to ``AutoEncoderDataset``.
        """
        super().__init__()
        self.annotation_train = annotation_train
        self.batch_size = int(batch_size)
        self.num_workers = int(num_workers)
        self.data_path = data_path

    def setup(self, stage=None):
        """Create the dataset and split it into train/val/test subsets."""
        full_dataset = AutoEncoderDataset(self.batch_size, self.num_workers,
                                           self.data_path, self.annotation_train)
        train_size = int(0.8 * len(full_dataset))
        val_size = int(0.1 * len(full_dataset))
        # The remainder goes to the test split so the sizes sum to the total.
        test_size = len(full_dataset) - train_size - val_size
        self.train_dataset, self.val_dataset, self.test_dataset = random_split(
            full_dataset, [train_size, val_size, test_size])

    def _make_loader(self, subset, shuffle):
        """Build a DataLoader over ``subset`` with the shared settings.

        ``batch_size`` stays at 1 because the ``__main__`` loop in this file
        drops the leading dimension of every batch it receives.
        """
        return DataLoader(subset, batch_size=1, num_workers=self.num_workers,
                           shuffle=shuffle, drop_last=True, pin_memory=True)

    def train_dataloader(self):
        """Return the shuffled loader over the training subset."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Return the loader over the validation subset (not the train set)."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        """Return the loader over the test subset (not the train set)."""
        return self._make_loader(self.test_dataset, shuffle=False)

if __name__ == '__main__':
    # Manual smoke test: run the autoencoder over the training loader with
    # native mixed precision and take one optimizer step per item.
    dataset_params = utils.config_parse('AUTOENCODER_DATASET')
    annotation_train = utils.dataset_image_autoencoder(
                            dataset_params['data_path'])
    dataset = AutoEncoderDataModule(**dataset_params,
                    annotation_train=annotation_train)
    dataset.setup()

    train_loader = dataset.train_dataloader()
    from models.EfficientNetb3.AutoEncoder import AutoEncoder

    device = torch.device('cuda:0')
    # Keep the weights in float32: calling .half() on the model while feeding
    # float32 inputs causes a dtype mismatch, and Adam on float16 parameters
    # is numerically unstable. autocast selects float16 per operation instead.
    model = AutoEncoder().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # Scales the loss so small float16 gradients do not underflow, and skips
    # the optimizer step when a gradient contains inf/NaN.
    scaler = torch.cuda.amp.GradScaler()

    for x, y in train_loader:
        # The loader uses batch_size=1; drop that leading dimension and move
        # both tensors to the same device as the model.
        x = x.squeeze(0).to(device, non_blocking=True)
        y = y.squeeze(0).to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            out = model(x)
            loss = F.mse_loss(out, y)
        print(out.shape)

        # train the model
        scaler.scale(loss).backward()

        # update the model
        scaler.step(optimizer)
        scaler.update()

ptrblck · March 23, 2023, 2:10am

I don’t know what your training wrapper does and if Lightning is using e.g. mixed-precision training by default. If so, then note that invalid gradients are expected when amp is used with float16, and the GradScaler will skip the parameter updates in this iteration before decreasing the scaling factor.
In this case disable torch.autograd.detect_anomaly.
If that’s not the case and amp is not used, check if you are seeing this issue using a specific input batch and make sure the model inputs contain valid values.

NavinKumarMNK · March 27, 2023, 11:00am

Do you mean that detecting anomalies causes this crash?
Yes, I’m using 16-bit precision training.
Can you suggest which method is best to follow in training: converting the model weights and optimizer to float16, or only the data and doing the calculation … i.e. O2 / O3?
Which is best? I am doing a project training an autoencoder on surveillance images, using EfficientNet-B3 as the encoder and plain transposed convolutions as the decoder. Actually, the model stops training after a few steps: the loss is stagnant. And when I convert the model to TensorRT, I receive an error saying that the weights are 16-bit precision but the input is float, and that EfficientNet’s slow_cpu_conv cannot process it, even though I am running on the GPU.

Actually, I am getting this error in all the models I tried, including LSTM, CoAtNet, etc.; at some point I receive the same error in one part or another. Regarding the LSTM, I am receiving it in

ptrblck · March 27, 2023, 5:16pm

Yes, invalid gradients are expected to occur during mixed-precision training in float16 and anomaly detection will thus report false errors.
Why do you enable it in the first place? This mode is only used during debugging.

NavinKumarMNK · March 27, 2023, 5:48pm

I thought it would report the error before it happens and solve it automatically. Thanks a lot; I will check and let you know whether it works or not.
So, do you have any ideas for training the EfficientNet-B3 model as an encoder for surveillance videos?

And do you have any idea why I am getting weights in 16-bit and data in 32-bit, and why I get the slow_cpu_error if I convert the data?

ptrblck · March 27, 2023, 7:31pm

I don’t know why your parameters are in float16, but I would assume you are manually calling .half() on the model, which is not the recommended approach using amp as described in the docs.

NavinKumarMNK · March 28, 2023, 1:19am

Yes, but EfficientNet-B3 is a plain PyTorch model, not a PyTorch Lightning model. Will Lightning automatically set its parameters’ dtype and device?
Also, does setting the amp back-end to ‘apex’ with mode O0, O1, O2 or O3 have any extra effects that would be useful?
As for the slow_cpu_error: it states that when I convert the PyTorch model to ONNX, the conversion happens on the CPU, so 16-bit is not supported in the CPU calculations.

ptrblck · March 28, 2023, 3:41am

PyTorch uses float32 as the default dtype and will not apply mixed-precision training automatically for you.
apex.amp is deprecated and shouldn’t be used anymore. Use the previously mentioned mixed-precision training via torch.amp instead.

NavinKumarMNK · March 28, 2023, 4:35pm

Without using any .half(), and doing the entire model training with precision = 16 in the PyTorch Lightning Trainer,
I am getting this error.

I found out that “deepspeed_stage_1” is causing this error.
If I train without a PyTorch Lightning Trainer() strategy (strategy is None), I am able to start training and export the model to ONNX without any errors.
And if the GPU is full, i.e. CUDA out of memory (I wrapped training in a try statement so that the model is saved in ONNX format when I get this CUDA error), the same error occurs at that point. This means that I get this error when the computation is done on the CPU.

Any suggestions to solve it?

ptrblck · March 29, 2023, 8:43am

Lightning still seems to call .half() on the model since the weight type is torch.cuda.HalfFloat as seen in the error message. I would still recommend to not use the deprecated apex.amp implementation and to avoid calling half on the model especially if you want to export the model to the CPU, which does not support all float16 operations and will fail.

NavinKumarMNK · March 29, 2023, 9:14am

Yes, I don’t use apex.amp; I am using pytorch_lightning’s “precision=16”. I don’t get any error while doing this. But when I set the strategy to deepspeed_stage_1 and distribute the workload, I get the same error.
And I am not exporting the model to the CPU; it seems DeepSpeed is doing it.

ryanxu8959 · May 11, 2023, 8:09pm

invalid gradients are expected to occur during mixed-precision training in float16 and anomaly detection will thus report false errors

I am curious: how does Lightning handle invalid gradients in this case?

ptrblck · May 15, 2023, 9:43pm

I don’t know what exactly Lightning does for precision=16, but would assume it should be using the native torch.amp implementation.