I’m new to machine learning and pytorch and any insight would be greatly appreciated. I’ve been playing around with the MNIST number recognition dataset and thought it would be interesting to train different models on the negative images.
One model in particular yielded interesting results. The model is just three fully connected layers with the ReLU activation. This is what the learning process looks like.
At first I wondered if this was an exploding gradient issue due to high activation values, but when testing with a model with the sigmoid activation function resulted more consistent results and smoother changes in loss, and from my understanding models with sigmoid activations are more susceptible to gradient explosion.
Is it just some bug in my code? Here’s the code snippet just in case.
class Network2(nn.Module):
def __init__(self):
super(Network2, self).__init__()
self.fc1 = nn.Linear(784, 200)
self.fc2 = nn.Linear(200, 100)
self.fc3 = nn.Linear(100, 10)
def forward(self, x):
x = torch.flatten(x, 1)
x = self.fc1(x)
x = nn.functional.relu(x)
x = self.fc2(x)
x = nn.functional.relu(x)
x = self.fc3(x)
x = nn.functional.relu(x)
return x
# traning code
eta = 0.1
epochs = 20
net2 = network.Network2()
print(net2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net2.parameters(), lr=eta)
net2.zero_grad()
val_accs = []
val_loss = []
train_accs = []
train_loss = []
grads = []
total_loss = 0.
total_acc = (0, 0)
for epoch in range(epochs):
print(f"epoch {epoch}")
for i, (X, y) in enumerate(train_loader):
net2.train() # training mode
output = net2(X)
loss = criterion(output, y)
total_loss += loss.item()
total_acc = tuple(a + b for a, b in zip(total_acc, accuracy(output, y)))
loss.backward()
if i + 1 == len(train_loader):
grads.append(net2.fc1.weight.grad.detach().numpy().flatten())
optimizer.step()
optimizer.zero_grad()
if i % 100 == 99: # for every 100th batch
net2.eval() # evaluation mode
with torch.no_grad():
total_val_loss = 0.
val_acc = (0, 0)
for j, (val_X, val_y) in enumerate(validation_loader):
# evaluation loss and accuracy check
output = net2(val_X)
val_acc = tuple(a + b for a, b in zip(val_acc, accuracy(output, val_y)))
total_val_loss += criterion(output, val_y).item()
print(f" evaluation accuracy: {100 * val_acc[0] / val_acc[1]}")
val_accs.append(val_acc)
val_loss.append(total_val_loss / len(validation_loader))
print(f" training accuracy: {100 * total_acc[0] / total_acc[1]}")
train_accs.append(total_acc)
train_loss.append(total_loss / 100)
total_loss = 0.
total_acc = (0, 0)