Custom SGD was not updating parameters

I am learning PyTorch from the d2l website and wrote a simple linear regression model. Here is my optimizer:

class SGD():
    def __init__(self, params, lr):
        self.params = params
        self.lr = lr
    def step(self):
        for param in self.params:
            param -= self.lr * param.grad
    def zero_grad(self):
        for param in self.params:
            if param.grad is not None:
                param.grad.zero_()

However, it does not update the model parameters during training, and I do not know what is going wrong.

Of note, the model parameters are updated correctly with the built-in optimizer: optim.SGD(model.parameters(), lr=self.learning_rate). Therefore, I suspect the problem is with my naive SGD implementation.

Below is a reproducible example:

import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

import warnings
warnings.filterwarnings("ignore")


class SyntheticRegressionData():
    """synthetic tensor dataset for linear regression from S02"""
    def __init__(self, w, b, noise=0.01, num_trains=1000, num_vals=1000, batch_size=32):
        self.w = w
        self.b = b
        self.noise = noise
        self.num_trains = num_trains
        self.num_vals = num_vals
        self.batch_size = batch_size
        n = num_trains + num_vals
        self.X = torch.randn(n, len(w))
        self.y = torch.matmul(self.X, w.reshape(-1, 1)) + b + noise * torch.randn(n, 1)
    def get_tensorloader(self, tensors, train, indices=slice(0, None)):
        tensors = tuple(a[indices] for a in tensors)
        dataset = TensorDataset(*tensors)
        return DataLoader(dataset, self.batch_size, shuffle=train)
    def get_dataloader(self, train=True):
        indices = slice(0, self.num_trains) if train else slice(self.num_trains, None)
        return self.get_tensorloader((self.X, self.y), train, indices)
    def train_dataloader(self):
        return self.get_dataloader(train=True)
    def val_dataloader(self):
        return self.get_dataloader(train=False)
    

class LinearNetwork(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_features, out_features))
        self.bias = nn.Parameter(torch.randn(out_features))
    def forward(self, x):
        return torch.matmul(x, self.weight) + self.bias


class SGD():
    def __init__(self, params, lr):
        self.params = params
        self.lr = lr
    def step(self):
        for param in self.params:
            param -= self.lr * param.grad
    def zero_grad(self):
        for param in self.params:
            if param.grad is not None:
                param.grad.zero_()


class MyTrainer():
    """
    custom trainer for linear regression
    """
    def __init__(self, max_epochs=10, learning_rate=1e-3):
        self.max_epochs = max_epochs
        self.learning_rate = learning_rate
    def fit(self, model, train_dataloader, val_dataloader=None):
        self.model = model
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.optim = SGD(self.model.parameters(), lr=self.learning_rate)
        self.loss = nn.MSELoss()
        self.num_train_batches = len(train_dataloader)
        self.num_val_batches = len(val_dataloader) if val_dataloader is not None else 0

        self.epoch = 0
        for epoch in range(self.max_epochs):
            self.fit_epoch()
    def fit_epoch(self):
        # train
        self.model.train()
        avg_loss = 0
        for x, y in self.train_dataloader:
            self.optim.zero_grad()
            y_hat = self.model(x)
            loss = self.loss(y_hat, y)
            loss.backward()
            self.optim.step()
            avg_loss += loss.item()
        avg_loss /= self.num_train_batches
        print(f'epoch {self.epoch}: train_loss={avg_loss:>8f}')
        # test
        if self.val_dataloader is not None:
            self.model.eval()
            val_loss = 0
            with torch.no_grad():
                for x, y in self.val_dataloader:
                    y_hat = self.model(x)
                    loss = self.loss(y_hat, y)
                    val_loss += loss.item()
            val_loss /= self.num_val_batches
            print(f'epoch {self.epoch}: val_loss={val_loss:>8f}')
        self.epoch += 1


torch.manual_seed(2024) 

trainer = MyTrainer(max_epochs=10, learning_rate=0.01)
model = LinearNetwork(2, 1)

torch.manual_seed(2024)
w = torch.tensor([2., -3.])
b = torch.Tensor([1.])
noise = 0.01
num_trains = 1000
num_vals = 1000
batch_size = 64
data = SyntheticRegressionData(w, b, noise, num_trains, num_vals, batch_size)
train_data = data.train_dataloader()
val_data = data.val_dataloader()

trainer.fit(model, train_data, val_data)

Here is the output:

epoch 0: train_loss=29.762345
epoch 0: val_loss=29.574341
epoch 1: train_loss=29.547140
epoch 1: val_loss=29.574341
epoch 2: train_loss=29.559777
epoch 2: val_loss=29.574341
epoch 3: train_loss=29.340937
epoch 3: val_loss=29.574341
epoch 4: train_loss=29.371171
epoch 4: val_loss=29.574341
epoch 5: train_loss=29.649407
epoch 5: val_loss=29.574341
epoch 6: train_loss=29.717251
epoch 6: val_loss=29.574341
epoch 7: train_loss=29.545675
epoch 7: val_loss=29.574341
epoch 8: train_loss=29.456314
epoch 8: val_loss=29.574341
epoch 9: train_loss=29.537769
epoch 9: val_loss=29.574341

As you can see, the validation loss stays exactly the same across all 10 epochs (the training loss only fluctuates because of batch shuffling), indicating that the parameters were not updated at all.

The problem is solved. The return value of model.parameters() is a generator, and it was exhausted the first time I called the zero_grad method of my custom SGD. Therefore, later calls to step or zero_grad had no effect. (Once the parameters are kept in a list and the loop actually runs, the in-place update also has to avoid autograd, e.g. via .data, because an in-place operation on a leaf tensor that requires grad raises a RuntimeError.) A corrected version is below:

class SGD():
    def __init__(self, params, lr):
        self.params = list(params)  # convert to list here
        self.lr = lr
    def step(self):
        for param in self.params:
            param.data -= self.lr * param.grad   # update .data
    def zero_grad(self):
        for param in self.params:
            if param.grad is not None:
                param.grad.zero_()
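
For anyone who hits the same thing, the exhaustion is easy to see in isolation. A minimal check (using a throwaway nn.Linear purely for illustration):

import torch
from torch import nn

m = nn.Linear(2, 1)
params = m.parameters()                 # a generator, not a list
print(len(list(params)))                # 2 (weight and bias); this consumes the generator
print(len(list(params)))                # 0 -- a second pass yields nothing
print(len(list(m.parameters())))        # 2 -- calling parameters() again gives a fresh generator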

Hi @mt1022,

You ideally shouldn’t use the .data attribute of a PyTorch tensor, as it’s now deprecated. If you want to create a custom SGD optimizer, the cleaner approach is to subclass torch.optim.Optimizer.
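
A minimal sketch of that approach (assuming plain SGD with no momentum or weight decay; the MySGD name is just for illustration and follows the standard torch.optim.Optimizer pattern):

import torch

class MySGD(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-3):
        # the base class materializes params and stores lr per parameter group
        super().__init__(params, defaults=dict(lr=lr))

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            lr = group['lr']
            for p in group['params']:
                if p.grad is None:
                    continue
                p -= lr * p.grad  # plain gradient-descent update

zero_grad() is inherited from the base class, and the exhausted-generator problem goes away because Optimizer.__init__ stores the parameters in self.param_groups. You can then use it exactly like the built-in optimizer: MySGD(model.parameters(), lr=0.01).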


Thanks for your suggestions :smiley: I am new to PyTorch and it took me some time to realize that I should use torch.no_grad instead of .data :sweat_smile:.

Ah yes, your step() method should be decorated with the torch.no_grad decorator, i.e.

@torch.no_grad()
def step(self):
    for param in self.params:
        param -= self.lr * param.grad