Hello, I am a beginner AI developer attempting to train my first model. I have gone through everything and cannot figure out why my training loss will not go down. Digging into it, I found that the gradients for the convolutional blocks are staying at 0.0, and I cannot work out why. Any help would be much appreciated.
import torch
from torch import nn
from pathlib import Path
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import os
device = "cuda" if torch.cuda.is_available() else "cpu"
DATA_PATH = Path("data/")
imagePathList = list(DATA_PATH.glob("*/*/*"))  # Creates list of paths to ALL images
data_transform = transforms.Compose([
    transforms.Resize(size=(64, 64)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor()
])
trainData = datasets.ImageFolder(root=DATA_PATH/"train", transform=data_transform)
testData = datasets.ImageFolder(root=DATA_PATH/"test", transform=data_transform)
classNames = trainData.classes
BATCH_SIZE = 1 # Adjusted for better training dynamics
trainDataLoader = DataLoader(dataset=trainData, batch_size=BATCH_SIZE, shuffle=True)
testDataLoader = DataLoader(dataset=testData, batch_size=BATCH_SIZE, shuffle=False)
class ASLModel(nn.Module):
    def __init__(self, input_shape, hidden_units, output_shapes):
        super().__init__()
        self.convolutional_block1 = nn.Sequential(
            nn.Conv2d(in_channels=input_shape, out_channels=hidden_units, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=hidden_units, out_channels=hidden_units, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.convolutional_block2 = nn.Sequential(
            nn.Conv2d(in_channels=hidden_units, out_channels=hidden_units, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=hidden_units, out_channels=hidden_units, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=hidden_units*16*16,  # 64x64 input is halved twice by MaxPool2d -> 16x16 feature maps
                      out_features=output_shapes)
        )
    def forward(self, x):
        x = self.convolutional_block1(x)
        x = self.convolutional_block2(x)
        x = self.classifier(x)
        return x
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item()
    acc = (correct / len(y_pred)) * 100
    return acc
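As a quick sanity check of accuracy_fn on its own, a made-up toy batch (not from my dataset) gives what I expect:
# Toy sanity check for accuracy_fn (made-up labels, not from my dataset): 3 of 4 match.
print(accuracy_fn(y_true=torch.tensor([0, 1, 2, 2]), y_pred=torch.tensor([0, 1, 1, 2])))  # 75.0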
model = ASLModel(input_shape=3, hidden_units=30, output_shapes=len(classNames))
lossFunction = nn.CrossEntropyLoss()
modelOptimizer = torch.optim.SGD(params=model.parameters(), lr=0.1) # Adjusted learning rate
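To double-check the classifier's in_features, I pass a single random dummy batch through the two convolutional blocks (fake data, purely a shape check):
# Shape check with a random dummy batch (fake data): a 64x64 input halved twice
# by MaxPool2d should come out as hidden_units x 16 x 16 before the classifier.
with torch.inference_mode():
    dummyBatch = torch.randn(1, 3, 64, 64)
    print(model.convolutional_block2(model.convolutional_block1(dummyBatch)).shape)
    # expected: torch.Size([1, 30, 16, 16])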
def trainingModel(modelParam, dataLoader, optimizer, lossFN, accuracyFN, device=device):
    modelParam.train()
    trainingLoss = 0
    trainingAcc = 0
    modelParam.to(device)
    for batch, (X, y) in enumerate(dataLoader):
        X, y = X.to(device), y.to(device)
        y_pred = modelParam(X)
        loss = lossFN(y_pred, y)
        trainingLoss += loss
        # print(f"loss={loss}, trainingLoss={trainingLoss}")
        trainingAcc += accuracyFN(y_true=y, y_pred=y_pred.argmax(dim=1))
        optimizer.zero_grad()
        loss.backward()
        for name, param in model.named_parameters():
            if param.grad is not None:
                print(f"Gradient for {name}: {param.grad.mean()}")
            else:
                print(f"Gradient for {name} is None")
        optimizer.step()
    trainingLoss /= len(dataLoader)
    trainingAcc /= len(dataLoader)
    print(f"Training Loss: {trainingLoss:.5f}, Training acc: {trainingAcc:.2f}")
def testingModel(modelParam, dataLoader, lossFN, accFN, device=device):
    modelParam.eval()
    testLoss, testAcc = 0, 0
    modelParam.to(device)
    with torch.inference_mode():
        for X, y in dataLoader:
            X, y = X.to(device), y.to(device)
            test_pred = modelParam(X)
            testLoss += lossFN(test_pred, y).item()
            testAcc += accFN(y_true=y, y_pred=test_pred.argmax(dim=1))
        testLoss /= len(dataLoader)
        testAcc /= len(dataLoader)
        print(f"Testing Loss: {testLoss:.5f}, Testing acc: {testAcc:.2f}")
epochs = 1
for epoch in range(epochs):
print(f"Epoch {epoch+1}")
trainingModel(model, trainDataLoader, modelOptimizer, lossFunction, accuracy_fn, device)
testingModel(model, testDataLoader, lossFunction, accuracy_fn, device)
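In case it helps with diagnosing the zero gradients, here is an extra check I can run after training (a sketch I put together myself, not from any tutorial): forward hooks that print the output statistics of each convolutional block for one batch, to see whether the activations coming out of the ReLUs have collapsed to all zeros.
# Extra diagnostic (my own sketch): print output stats of each conv block for one
# batch to see whether the ReLU outputs have collapsed to all zeros.
def reportActivationStats(blockName):
    def hook(module, inputs, output):
        print(f"{blockName}: mean={output.mean().item():.6f}, max={output.max().item():.6f}")
    return hook

hookHandles = [
    model.convolutional_block1.register_forward_hook(reportActivationStats("convolutional_block1")),
    model.convolutional_block2.register_forward_hook(reportActivationStats("convolutional_block2")),
]
with torch.inference_mode():
    X_check, y_check = next(iter(trainDataLoader))
    model(X_check.to(device))
for handle in hookHandles:
    handle.remove()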