Loss is NaN from first epoch

Hi, I am pretty new to PyTorch and I am trying to train a classification model. I uploaded folders with data corresponding to 5 classes. It is for sign language recognition, and I preprocessed the data the same way people on Kaggle with good accuracy did. I stacked hand and face landmark coordinates into arrays of size 96x42x3, and I want to classify those arrays. I tried many different architectures, but I always get NaN as the train and validation loss.

I tried gradient clipping, checked the data to make sure it doesn't contain any NaNs or infs, and tried setting the learning rate to zero for debugging, but even then I got a NaN loss after a few steps. I also tried printing the gradient values to check where the problem is, but it doesn't look like they are getting too small.

I then tried setting the learning rate to a higher value, 0.1 and 0.5, and for some reason that helped. That broke me; I don't understand what is wrong or what to try next. Can someone please explain what I am doing wrong and point me toward fixing this?

I am not sure what is important to share here, so I uploaded all my code along with the 5 folders of data; I only deleted my wandb key from the code. Everything is uploaded here: ASL_signs - Google Drive

Here is __getitem__ (maybe I am loading the data wrong):

def __getitem__(self, idx):
    npy_path = self.npy_files[idx]
    label = self.labels[idx]

    sample = np.load(npy_path)
    sample = sample.astype(np.float32)
    sample = np.transpose(sample, (2, 0, 1))

    # per-sample min-max normalization
    sample = (sample - sample.min()) / (sample.max() - sample.min())

    return sample, label

And here is the Lightning module:

class AslLitModelMatrix(pl.LightningModule):
    def __init__(self, num_classes, learning_rate=1e-3):
        super().__init__()
        self.save_hyperparameters()
        self.learning_rate = learning_rate
        self.num_classes = num_classes

        # Define the CNN layers and fully connected layers according to the architecture
        self.layer1 = nn.Sequential(nn.Conv2d(3, 16, 3, stride=1, padding=1),
                                    nn.MaxPool2d(2),
                                    nn.BatchNorm2d(16))

        self.layer2 = nn.Sequential(nn.Conv2d(16, 32, 3, stride=1, padding=1),
                                    nn.MaxPool2d(2),
                                    )

        self.layer3 = nn.Sequential(nn.Conv2d(32, 64, 3, stride=1, padding=1),
                                    nn.MaxPool2d(2),
                                    nn.BatchNorm2d(64))

        self.layer4 = nn.Sequential(nn.Conv2d(64, 64, 3, stride=1, padding=1),
                                    nn.MaxPool2d(2),
                                    nn.BatchNorm2d(64))

        self.global_pool = nn.AdaptiveMaxPool2d(1)
        self.fc1 = nn.Linear(768, self.num_classes)
        self.drop = nn.Dropout(0.1)
        self.fc2 = nn.Linear(128, self.num_classes)

        self.accuracy = Accuracy(task="multiclass", num_classes=self.num_classes)
        self.learning_rate = learning_rate

    def forward(self, x):
        x = self.layer1(x)
        x = self.drop(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        #x = self.drop(x)
        #x = self.fc2(x)

        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        self.log('train_loss', loss)
        self.log('train_acc', self.accuracy(y_hat, y), prog_bar=True)

        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', self.accuracy(y_hat, y), prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        self.log('test_loss', loss, prog_bar=True)
        self.log('test_acc', self.accuracy(y_hat, y), prog_bar=True)
        return loss


    def on_after_backward(self):
        """Called after backward() and before optimizers do anything."""
        for name, param in self.named_parameters():
            if param.grad is not None:
                print(f"Gradient of {name}: {param.grad.data}")
            else:
                print(f"No gradient for {name}")

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=1e-3)

        def lambda_epoch(epoch):
            if epoch < 10:
                return 1.0
            elif 10 <= epoch < 20:
                return 0.5
            else:
                return 0.1

        scheduler_lr = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_epoch)
        scheduler_plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=5)

        return {
            'optimizer': optimizer,
            'lr_scheduler': {'scheduler': scheduler_lr, 'interval': 'epoch'},
            'lr_scheduler': {'scheduler': scheduler_plateau, 'interval': 'epoch', 'monitor': 'val_loss'},
            'gradient_clip_val': 0.5,
        }

I’m not going to run your code, but I’ll give you the steps I might take to debug the issue.

  1. Insert conditional if statements at various steps in the forward pass, e.g.:

     if torch.isnan(x).any():
         print("NaN found")

  2. Apply torch.nan_to_num on the inputs and labels.

  3. Check the data path and the (custom) loss function for any divisions that could result in a divide by zero, and guard them with an appropriate method (e.g. adding a small epsilon). A combined sketch of these checks follows this list.
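
Not knowing where yours breaks exactly, here is a minimal sketch of how those three steps could fit together. check_finite and safe_minmax are hypothetical helper names, and the 96x42x3 shape is taken from your post:

import numpy as np
import torch

def check_finite(x, where=""):
    # step 1: call this at various points in forward() to see where NaNs/infs first appear
    if torch.isnan(x).any() or torch.isinf(x).any():
        print(f"non-finite values found {where}")
    return x

def safe_minmax(sample, eps=1e-8):
    # step 3: per-sample min-max with a small epsilon so a constant sample
    # (max == min) does not turn into 0/0 = NaN
    rng = sample.max() - sample.min()
    return (sample - sample.min()) / (rng + eps)

# step 2: scrub NaN/inf from a loaded sample before normalizing it
sample = np.zeros((96, 42, 3), dtype=np.float32)  # stand-in for np.load(npy_path)
sample = np.nan_to_num(sample)
sample = safe_minmax(sample)

x = torch.from_numpy(sample).permute(2, 0, 1).unsqueeze(0)
x = check_finite(x, "after loading")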

Is your loss always NaN, or does it only become NaN after a while? Is your loss still NaN when only using a single batch of size 1?
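
If you are training with a pl.Trainer, a quick way to run that single-batch test is the built-in overfit_batches option; note that gradient clipping is normally passed to the Trainer (gradient_clip_val) rather than returned from configure_optimizers. A sketch with hypothetical model/datamodule names:

import pytorch_lightning as pl

# repeatedly fit a single batch; the loss should drop toward zero within a few epochs.
# If it is NaN even here, the problem is in the data or the forward pass, not the optimizer.
trainer = pl.Trainer(max_epochs=20, overfit_batches=1, gradient_clip_val=0.5)
# trainer.fit(model, datamodule=dm)  # model and dm are placeholders for your objects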