PyTorch optimizer.step() function doesn't update weights using CrossEntropyLoss and SGD

dalexay16 · October 10, 2023, 3:27am

I am a beginner with PyTorch. I am following its quickstart guide and have replicated its code almost verbatim, but in my output, the model’s weights are not changing. I took a look at some other posts about the step function not working, but their suggestions didn’t seem to work for me. Would someone be able to point me in the right direction please?

Definition of custom module:

# Pick the compute backend: CUDA first, then Apple MPS, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier: 28*28 input features -> 512 -> 512 -> 10 logits."""

    def __init__(self):
        super().__init__()
        n_inputs, n_hidden, n_outputs = 28 * 28, 512, 10
        # Turns each sample of the batch into a single flat feature vector.
        self.flatten = nn.Flatten()
        # A Linear layer multiplies every input value by a weight, sums the
        # products and adds a bias; the ReLU after it applies max(input, 0).
        layers = [
            nn.Linear(n_inputs, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_outputs),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten the batch ``x`` and return the raw scores of the layer stack."""
        return self.linear_relu_stack(self.flatten(x))
    
# Build the network, then move it onto the selected device and show its layers
model = NeuralNetwork()
model = model.to(device)
print(model)

Declaration of loss function and optimizer:

# Cross-entropy loss, applied to the raw scores returned by the model
loss_fn = nn.CrossEntropyLoss()

# Plain stochastic gradient descent over every parameter of the model
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)

Definition of training function (with additional lines printing the weights before and after):

def train(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader``, printing a weight snapshot per batch.

    For every batch, the first parameter tensor of the model is printed as it
    was right before and right after ``optimizer.step()``. On every hundredth
    batch the current loss and the number of samples seen so far are reported.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches; its ``dataset``
            attribute must support ``len()``.
        model: Module to train. It must have at least one parameter; batches
            are moved to the device of its first parameter.
        loss_fn: Callable taking ``(pred, y)`` and returning a scalar tensor.
        optimizer: Optimizer providing ``step()`` and ``zero_grad()``.
    """
    # Number of samples
    size = len(dataloader.dataset)
    # Put module in training mode
    model.train()

    # optimizer.step() updates parameters in place, so one reference to the
    # first parameter stays valid for the whole epoch.
    watched_param = next(model.parameters())
    # Send the data to wherever the model lives instead of relying on a
    # module-level variable; inputs and weights must share a device.
    target_device = watched_param.device

    # Iterate through batches
    for batch, (X, y) in enumerate(dataloader):
        # Transfer data to the device to do calculations
        X, y = X.to(target_device), y.to(target_device)

        # Calculate predicted y values
        pred = model(X)
        # Get loss on predicted y values
        loss = loss_fn(pred, y)

        # Modify the weights in the NN
        loss.backward()
        # .cpu() is needed before .numpy(): NumPy conversion only works for
        # tensors in host memory and raises for CUDA/MPS tensors.
        before_step = watched_param.detach().clone().cpu().numpy()
        optimizer.step()
        after_step = watched_param.detach().clone().cpu().numpy()
        # Reset the gradients
        optimizer.zero_grad()
        # NOTE: with a small learning rate the two snapshots may differ only
        # in the last printed digits, so compare them carefully.
        print("Before: ", before_step)
        print("After:  ", after_step)

        # For every hundred batches, report the loss
        if batch % 100 == 0:
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"Loss: {loss_value:>7f} [{current:>5d}/{size:>5d}]")

Output when training:

Before:  [[-0.00293945 -0.03357641  0.02126637 ... -0.03217686  0.02163239
   0.01577387]
 [ 0.01675193  0.00785605  0.00159624 ...  0.02838659  0.01042882
  -0.00520851]
 [ 0.02488738  0.0200601   0.02569237 ... -0.01135333  0.00391812
  -0.00557524]
 ...
 [ 0.03510774  0.02370638 -0.03260146 ...  0.02252392  0.01040222
   0.01905873]
 [-0.03008673 -0.0211468   0.01441458 ... -0.0064997  -0.01389741
   0.01241069]
 [-0.02991394 -0.02001241 -0.01785357 ... -0.03501913  0.01994138
   0.00311066]]
After:   [[-0.00293945 -0.03357641  0.02126622 ... -0.03219151  0.02162728
   0.01577382]
 [ 0.01675189  0.00785571  0.00159275 ...  0.02837904  0.01042525
  -0.00520895]
 [ 0.02488733  0.02005999  0.02569141 ... -0.01130284  0.00391492
  -0.00557806]
 ...
 [ 0.03510772  0.02370632 -0.03260311 ...  0.02265108  0.01045562
   0.01906238]
 [-0.03008673 -0.0211468   0.01441456 ... -0.00650601 -0.01389765
...

As seen in the output above, the weights do not change, and they do not change across all iterations of training.

Any help would be greatly appreciated!

ptrblck · October 10, 2023, 3:58am

Your code works for me using random inputs:

# Minimal reproduction: run one optimisation step on a single random sample
# and compare the last parameter tensor before and after optimizer.step().
X = torch.randn(1, 28*28)
y = torch.randint(0, 10, (1,))
X, y = X.to(device), y.to(device)

# Calculate predicted y values
pred = model(X)
# Get loss on predicted y values
loss = loss_fn(pred, y)

# Modify the weights in the NN
loss.backward()
# Snapshot the last parameter; .cpu() moves the copy to host memory before
# the NumPy conversion (the gradients printed below live on cuda:0).
before_step = list(model.parameters())[-1].clone().detach().cpu().numpy()
optimizer.step()
after_step = list(model.parameters())[-1].clone().detach().cpu().numpy()
# Gradient of the first layer's weights (must be read before zero_grad()).
print(model.linear_relu_stack[0].weight.grad)
# tensor([[-0.0005, -0.0077,  0.0055,  ..., -0.0035, -0.0005,  0.0014],
#         [-0.0009, -0.0140,  0.0100,  ..., -0.0064, -0.0009,  0.0026],
#         [-0.0000, -0.0000,  0.0000,  ..., -0.0000, -0.0000,  0.0000],
#         ...,
#         [-0.0000, -0.0000,  0.0000,  ..., -0.0000, -0.0000,  0.0000],
#         [-0.0021, -0.0321,  0.0230,  ..., -0.0147, -0.0021,  0.0059],
#         [-0.0000, -0.0000,  0.0000,  ..., -0.0000, -0.0000,  0.0000]],
#        device='cuda:0')
# Gradient of the last layer's weights.
print(model.linear_relu_stack[-1].weight.grad)
# tensor([[0.0000, 0.0245, 0.0088,  ..., 0.0422, 0.0000, 0.0112],
#         [0.0000, 0.0256, 0.0092,  ..., 0.0441, 0.0000, 0.0117],
#         [0.0000, 0.0282, 0.0101,  ..., 0.0485, 0.0000, 0.0129],
#         ...,
#         [0.0000, 0.0246, 0.0089,  ..., 0.0424, 0.0000, 0.0113],
#         [0.0000, 0.0266, 0.0096,  ..., 0.0459, 0.0000, 0.0122],
#         [0.0000, 0.0263, 0.0095,  ..., 0.0453, 0.0000, 0.0120]],
#        device='cuda:0')

# Reset the gradients
optimizer.zero_grad()
print("Before: ", before_step)
# Before:  [-0.03568246  0.00952333 -0.0185076  -0.0245529   0.03722464 -0.03158068
#  -0.02073667 -0.01687148  0.0014896   0.00559398]

print("After:  ", after_step)
# After:   [-0.03577552  0.00942608 -0.01861465 -0.02465206  0.03811001 -0.03167672
#  -0.02083459 -0.01696509  0.00138834  0.00549396]

# Element-wise absolute change caused by the step; non-zero entries show that
# the parameter was updated.
print("Diff:, ", np.abs(after_step - before_step))
# Diff:,  [9.3065202e-05 9.7257085e-05 1.0704808e-04 9.9154189e-05 8.8537112e-04
#  9.6037984e-05 9.7915530e-05 9.3610957e-05 1.0126631e-04 1.0001566e-04]

and I also don’t see any obvious errors in the code.
To see the parameter updates better you could artificially increase the learning rate to e.g. 1.0 or even higher.

dalexay16 · October 10, 2023, 4:22am

Interesting, it works when I use random values too! Maybe it’s something to do with how the data is initialized?

Here’s the downloading of the datasets:

# FashionMNIST training split, stored under "data", with images converted to tensors
training_data = datasets.FashionMNIST(
    "data", train=True, transform=ToTensor(), download=True
)

# FashionMNIST test split, configured the same way
testing_data = datasets.FashionMNIST(
    "data", train=False, transform=ToTensor(), download=True
)

Here’s the conversion to DataLoaders:

# Samples per batch, i.e. how many samples are processed before each model update
batch_size = 64

# Wrap both splits in iterable loaders that hand out batches of that size
train_dataloader = DataLoader(dataset=training_data, batch_size=batch_size)
test_dataloader = DataLoader(dataset=testing_data, batch_size=batch_size)

Is there anything here that might result in the model not updating?

ptrblck · October 10, 2023, 4:29am

FashionMNIST still works using my code.

dalexay16 · October 10, 2023, 5:24am

Would you mind sharing your exact code? I’m at a loss as to where or why my code is failing.

ptrblck · October 10, 2023, 5:26am

Sure, this is the code I used to test it:

# Pick the compute backend: CUDA first, then Apple MPS, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier: 28*28 input features -> 512 -> 512 -> 10 logits."""

    def __init__(self):
        super().__init__()
        n_inputs, n_hidden, n_outputs = 28 * 28, 512, 10
        # Turns each sample of the batch into a single flat feature vector.
        self.flatten = nn.Flatten()
        # A Linear layer multiplies every input value by a weight, sums the
        # products and adds a bias; the ReLU after it applies max(input, 0).
        layers = [
            nn.Linear(n_inputs, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_outputs),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten the batch ``x`` and return the raw scores of the layer stack."""
        return self.linear_relu_stack(self.flatten(x))
    
# Instantiate the network on the selected device and print its layer structure.
model = NeuralNetwork().to(device)
print(model)

loss_fn = nn.CrossEntropyLoss()

# Stochastic Gradient Descent
# The learning rate is set to 1.0 here so that a single step changes the
# parameters by a clearly visible amount (see the "Diff" output at the end).

optimizer = torch.optim.SGD(model.parameters(), lr=1.)

# NOTE(review): download=False presumably expects the dataset to already be
# present under ./data -- confirm before running on a fresh machine.
training_data = datasets.FashionMNIST(
    root="./data",
    train=True,
    download=False,
    transform=transforms.ToTensor(),
)
loader = DataLoader(training_data, batch_size=64, shuffle=True)
# Take a single batch; one optimisation step is enough for this check.
X, y = next(iter(loader))

X, y = X.to(device), y.to(device)

# Calculate predicted y values
pred = model(X)
# Get loss on predicted y values
loss = loss_fn(pred, y)

# Modify the weights in the NN
loss.backward()
# Snapshot the last parameter; .cpu() moves the copy to host memory before
# the NumPy conversion (the gradients printed below live on cuda:0).
before_step = list(model.parameters())[-1].clone().detach().cpu().numpy()
optimizer.step()
after_step = list(model.parameters())[-1].clone().detach().cpu().numpy()
# Gradient of the first layer's weights (must be read before zero_grad()).
print(model.linear_relu_stack[0].weight.grad)
# tensor([[ 0.0000e+00,  1.2901e-07,  0.0000e+00,  ..., -4.0020e-05,
#           2.8034e-05,  3.4060e-06],
#         [ 0.0000e+00, -5.2218e-08,  0.0000e+00,  ...,  1.6958e-04,
#          -7.1564e-05, -1.0265e-05],
#         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
#           0.0000e+00,  0.0000e+00],
#         ...,
#         [ 0.0000e+00,  6.2321e-07,  0.0000e+00,  ..., -5.7360e-05,
#          -4.6703e-05, -3.7786e-06],
#         [ 0.0000e+00,  4.9584e-08,  0.0000e+00,  ...,  1.0065e-04,
#           1.1835e-04,  1.1986e-05],
#         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
#           0.0000e+00,  0.0000e+00]], device='cuda:0')

# Gradient of the last layer's weights.
print(model.linear_relu_stack[-1].weight.grad)
# tensor([[ 2.8525e-03,  4.3362e-05,  1.2644e-03,  ...,  2.1098e-03,
#           1.3621e-02,  5.5132e-03],
#         [-6.6413e-03,  4.5133e-05,  1.3861e-03,  ..., -4.7151e-05,
#           6.9183e-03,  6.6764e-04],
#         [ 1.1066e-03,  4.6549e-05,  3.0659e-04,  ..., -1.0906e-03,
#          -3.6442e-03, -9.0843e-04],
#         ...,
#         [ 1.9374e-03,  4.1910e-05,  1.1706e-03,  ..., -1.3773e-03,
#           6.1790e-03,  5.8046e-03],
#         [ 5.3842e-04,  4.2445e-05,  2.0277e-05,  ...,  1.4919e-03,
#          -7.0348e-03, -1.4172e-03],
#         [-5.4650e-03,  4.2526e-05, -1.1702e-05,  ...,  1.5916e-03,
#          -5.4638e-03, -1.5523e-04]], device='cuda:0')

# Reset the gradients
optimizer.zero_grad()
print("Before: ", before_step)
# Before:  [-0.02619972 -0.02386615  0.03991131 -0.04317933 -0.01696173 -0.02053201
#   0.00938077 -0.02382676 -0.00550956 -0.01243122]

print("After:  ", after_step)
# After:   [-0.10896325  0.01213356  0.04316358 -0.07535928 -0.01138728  0.02181668
#   0.01297569 -0.00668069 -0.00976074 -0.00115265]

# Element-wise absolute change caused by the step; non-zero entries show that
# the parameter was updated.
print("Diff:, ", np.abs(after_step - before_step))
# Diff:,  [0.08276353 0.03599971 0.00325226 0.03217996 0.00557445 0.04234869
#  0.00359492 0.01714607 0.00425118 0.01127857]