Trying to understand what's going on here

I’m new to machine learning and PyTorch, so any insight would be greatly appreciated. I’ve been playing around with the MNIST digit recognition dataset and thought it would be interesting to train different models on the negative (inverted) images.
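
For context, here's roughly how I build the negative images and the data loaders (a sketch; the exact batch size and train/validation split may differ from what I actually ran, and it assumes ToTensor() scales pixels to [0, 1] so 1 - x inverts them):

import torch
import torchvision
from torchvision import transforms

# negative images: flip white-on-black digits to black-on-white
invert = transforms.Compose([
    transforms.ToTensor(),                  # [0, 255] ints -> [0.0, 1.0] floats
    transforms.Lambda(lambda x: 1.0 - x),   # invert the image
])

mnist = torchvision.datasets.MNIST("data", train=True, download=True, transform=invert)
train_set, val_set = torch.utils.data.random_split(mnist, [50_000, 10_000])

train_loader = torch.utils.data.DataLoader(train_set, batch_size=64, shuffle=True)
validation_loader = torch.utils.data.DataLoader(val_set, batch_size=64)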

One model in particular yielded interesting results. The model is just three fully connected layers with ReLU activations. This is what the learning process looks like:

At first I wondered if this was an exploding gradient issue due to high activation values, but when I tested the same architecture with the sigmoid activation function instead, I got more consistent results and smoother changes in loss. From my understanding, models with sigmoid activations are more susceptible to gradient explosion, so if that were the cause I'd expect the sigmoid version to be worse, not better.
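
For reference, the sigmoid model was the same network with only the activations swapped, roughly like this (illustrative sketch; it mirrors Network2 below, with sigmoid in place of every ReLU, including after the output layer):

import torch
import torch.nn as nn

class Network2Sigmoid(nn.Module):
    def __init__(self):
        super(Network2Sigmoid, self).__init__()
        self.fc1 = nn.Linear(784, 200)
        self.fc2 = nn.Linear(200, 100)
        self.fc3 = nn.Linear(100, 10)

    def forward(self, x):
        x = torch.flatten(x, 1)
        x = torch.sigmoid(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))   # sigmoid on the output too, matching the ReLU placement
        return x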

Is it just some bug in my code? Here’s the code snippet, just in case.

import torch
import torch.nn as nn
import torch.nn.functional as F

class Network2(nn.Module):
    def __init__(self):
        super(Network2, self).__init__()
        self.fc1 = nn.Linear(784, 200)   # 28*28 = 784 input pixels -> 200 hidden units
        self.fc2 = nn.Linear(200, 100)
        self.fc3 = nn.Linear(100, 10)    # 10 output classes, one per digit

    def forward(self, x):
        x = torch.flatten(x, 1)          # (N, 1, 28, 28) -> (N, 784)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))          # ReLU applied to the output layer as well
        return x
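
A quick shape sanity check on a fake batch (hypothetical, just to illustrate the expected input and output sizes):

net = Network2()
dummy = torch.randn(32, 1, 28, 28)   # fake batch of 32 MNIST-sized images
print(net(dummy).shape)              # torch.Size([32, 10]), one score per digit class
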
# training code
eta = 0.1      # learning rate
epochs = 20

net2 = network.Network2()   # Network2 lives in my network module (same class as above)
print(net2)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net2.parameters(), lr=eta)
net2.zero_grad()

# histories, recorded every 100 training batches
val_accs = []
val_loss = []
train_accs = []
train_loss = []
grads = []     # first-layer weight gradients, sampled once per epoch

# running totals over the current 100-batch window
total_loss = 0.
total_acc = (0, 0)   # (correct, total)

for epoch in range(epochs):
    print(f"epoch {epoch}")
    
    for i, (X, y) in enumerate(train_loader):
        net2.train() # training mode
        
        output = net2(X)
        loss = criterion(output, y)
        total_loss += loss.item()
        total_acc = tuple(a + b for a, b in zip(total_acc, accuracy(output, y)))
        
        loss.backward()
        
        if i + 1 == len(train_loader):
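            # save the first layer's weight gradients on the last batch of each epoch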
            grads.append(net2.fc1.weight.grad.detach().numpy().flatten())
        
        optimizer.step()
        optimizer.zero_grad()

        if i % 100 == 99: # for every 100th batch
            net2.eval() # evaluation mode
            
            with torch.no_grad():
                total_val_loss = 0.
                val_acc = (0, 0)
                for j, (val_X, val_y) in enumerate(validation_loader):
                    # evaluation loss and accuracy check
                    output = net2(val_X)
                    val_acc = tuple(a + b for a, b in zip(val_acc, accuracy(output, val_y)))
                    total_val_loss += criterion(output, val_y).item()
                print(f" evaluation accuracy: {100 * val_acc[0] / val_acc[1]}")
                val_accs.append(val_acc)
                val_loss.append(total_val_loss / len(validation_loader))
                
            print(f" training accuracy: {100 * total_acc[0] / total_acc[1]}")
            train_accs.append(total_acc)
            train_loss.append(total_loss / 100) # average over the last 100 training batches
                       
            total_loss = 0.
            total_acc = (0, 0)
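
The accuracy helper isn't shown above; it just returns a (correct, total) count for a batch, roughly:

def accuracy(output, target):
    # top-1 correct predictions and batch size, as a (correct, total) tuple
    preds = output.argmax(dim=1)
    return (preds == target).sum().item(), target.size(0)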

Here are a few more tests with the same model: