Hi, I am pretty new to PyTorch and I am trying to train a classification model. I uploaded folders with data corresponding to 5 classes. It is for sign language recognition, and I preprocessed the data the same way people on Kaggle with good accuracy did: I stacked hand and face landmark coordinates into arrays of size 96x42x3, and I want to classify those arrays. I have tried many different architectures, but I always get NaN as both the train and validation loss.
I tried gradient clipping, checked the data for NaNs and infs, and tried setting the learning rate to zero for debugging, but even then I got NaN loss after a few steps. I also printed the gradient values to find where the problem is, but it doesn't look like they are getting too small.
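For reference, this is roughly the check I ran over the saved arrays (just a sketch; `files` stands in for the same list of .npy paths my Dataset uses):

import numpy as np

# scan every saved landmark array for NaN/inf values
for path in files:
    arr = np.load(path)
    if not np.isfinite(arr).all():
        print("bad values in", path)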
I then tried setting the learning rate to a higher value, 0.1 and 0.5, and for some reason that helped. That broke me; I don't understand what is wrong or what to try next. Can someone please explain what I am doing wrong and point me toward fixing this?
I am not sure what is important to share here, so I uploaded all my code along with the 5 data folders; I only removed my wandb key from the code. Everything is uploaded here: ASL_signs - Google Drive
Here is __getitem__ (maybe I am loading the data wrong):
def __getitem__(self, idx):
    npy_path = self.npy_files[idx]
    label = self.labels[idx]
    sample = np.load(npy_path)                 # landmark array of shape (96, 42, 3)
    sample = sample.astype(np.float32)
    sample = np.transpose(sample, (2, 0, 1))   # -> (3, 96, 42), channels first
    # per-sample min-max normalization
    sample = (sample - sample.min()) / (sample.max() - sample.min())
    return sample, label
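To make sure the loading itself is sane, I also pulled one batch out of the DataLoader and looked at it (a quick sketch; the dataset, loader, and batch size here are placeholders for the ones in my code):

import torch
from torch.utils.data import DataLoader

# sanity check on a single batch from the training set
loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
x, y = next(iter(loader))
print(x.shape)                                  # expect (32, 3, 96, 42) after the transpose above
print(torch.isnan(x).any(), x.min(), x.max())   # look for NaNs and the value range
print(y[:10])                                   # labels should be ints in 0..4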
And here is the Lightning module:
class AslLitModelMatrix(pl.LightningModule):
    def __init__(self, num_classes, learning_rate=1e-3):
        super().__init__()
        self.save_hyperparameters()
        self.learning_rate = learning_rate
        self.num_classes = num_classes
        # Define the CNN layers and fully connected layers according to the architecture
        self.layer1 = nn.Sequential(nn.Conv2d(3, 16, 3, stride=1, padding=1),
                                    nn.MaxPool2d(2),
                                    nn.BatchNorm2d(16))
        self.layer2 = nn.Sequential(nn.Conv2d(16, 32, 3, stride=1, padding=1),
                                    nn.MaxPool2d(2),
                                    )
        self.layer3 = nn.Sequential(nn.Conv2d(32, 64, 3, stride=1, padding=1),
                                    nn.MaxPool2d(2),
                                    nn.BatchNorm2d(64))
        self.layer4 = nn.Sequential(nn.Conv2d(64, 64, 3, stride=1, padding=1),
                                    nn.MaxPool2d(2),
                                    nn.BatchNorm2d(64))
        self.global_pool = nn.AdaptiveMaxPool2d(1)
        self.fc1 = nn.Linear(768, self.num_classes)   # 768 = 64 channels * 6 * 2 after four MaxPool2d(2) on a 96x42 input
        self.drop = nn.Dropout(0.1)
        self.fc2 = nn.Linear(128, self.num_classes)
        self.accuracy = Accuracy(task="multiclass", num_classes=self.num_classes)
        self.learning_rate = learning_rate
    def forward(self, x):
        x = self.layer1(x)
        x = self.drop(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        #x = self.drop(x)
        #x = self.fc2(x)
        return x
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        self.log('train_loss', loss)
        self.log('train_acc', self.accuracy(y_hat, y), prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', self.accuracy(y_hat, y), prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        self.log('test_loss', loss, prog_bar=True)
        self.log('test_acc', self.accuracy(y_hat, y), prog_bar=True)
        return loss
    def on_after_backward(self):
        """Called after backward() and before optimizers do anything."""
        for name, param in self.named_parameters():
            if param.grad is not None:
                print(f"Gradient of {name}: {param.grad.data}")
            else:
                print(f"No gradient for {name}")
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=1e-3)

        def lambda_epoch(epoch):
            if epoch < 10:
                return 1.0
            elif 10 <= epoch < 20:
                return 0.5
            else:
                return 0.1

        scheduler_lr = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_epoch)
        scheduler_plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=5)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {'scheduler': scheduler_lr, 'interval': 'epoch'},
            'lr_scheduler': {'scheduler': scheduler_plateau, 'interval': 'epoch', 'monitor': 'val_loss'},
            'gradient_clip_val': 0.5,
        }
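And this is roughly how I run the training (a sketch from memory; the wandb logger and callbacks are in the Drive link, and max_epochs and the loaders are placeholders):

# rough sketch of the training call; logger/callbacks omitted, values are placeholders
model = AslLitModelMatrix(num_classes=5, learning_rate=1e-3)
trainer = pl.Trainer(max_epochs=30, gradient_clip_val=0.5)
trainer.fit(model, train_loader, val_loader)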