Hi all,

I am following a pytorch tutorial and find some behavior I did not expect. In particular, the convergence of a minimizer is much slower when using autograd and it is even slower still when I wrap my (linear) model in a pytorch class.

For starters, in the code below:

```
import numpy as np
# Compute every step manually
# Linear regression
# f = w * x
# suppose : f = 2 * x
X = np.array([1, 2, 3, 4], dtype=np.float32)
Y = np.array([2, 4, 6, 8], dtype=np.float32)
w = 0.0
# model output
def forward(x):
return w * x
# loss = MSE
def loss(y, y_pred):
return ((y_pred - y)**2).mean()
# J = MSE = 1/N * (w*x - y)**2
# dJ/dw = 1/N * 2x(w*x - y)
def gradient(x, y, y_pred):
return np.dot(2*x, y_pred - y).mean()
print(f'Prediction before training: f(5) = {forward(5):.7f}')
# Training
learning_rate = 0.01
n_iters = 100
for epoch in range(n_iters):
# predict = forward pass
y_pred = forward(X)
# loss
l = loss(Y, y_pred)
# calculate gradients
dw = gradient(X, Y, y_pred)
# update weights
w -= learning_rate * dw
if epoch % 2 == 0:
print(f'epoch {epoch+1}: w = {w:.7f}, loss = {l:.7f}')
print(f'Prediction after training: f(5) = {forward(5):.7f}')
```

This code will converge on the correct linear weight in about 20 iterations. (This is setting machine precision of 7 digits for float32). And the loss stops decreasing around iteration 13.

But the next code (where autograd is used) never converges on the correct weight. It also does not give the correct answer for the test element, within machine precision. The loss stops decreasing around iteration 70.

```
# Here we replace the manually computed gradient with autograd
import torch
# Linear regression
# f = w * x
# here : f = 2 * x
X = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
Y = torch.tensor([2, 4, 6, 8], dtype=torch.float32)
w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)
# model output
def forward(x):
return w * x
# loss = MSE
def loss(y, y_pred):
return ((y_pred - y)**2).mean()
print(f'Prediction before training: f(5) = {forward(5).item():.7f}')
# Training
learning_rate = 0.01
n_iters = 100
for epoch in range(n_iters):
# predict = forward pass
y_pred = forward(X)
# loss
l = loss(Y, y_pred)
# calculate gradients = backward pass
l.backward()
# update weights
#w.data = w.data - learning_rate * w.grad
with torch.no_grad():
w -= learning_rate * w.grad
# zero the gradients after updating
w.grad.zero_()
if epoch % 1 == 0:
print(f'epoch {epoch+1}: w = {w.item():.7f}, loss = {l.item():.7f}')
print(f'Prediction after training: f(5) = {forward(5).item():.7f}')
```

Lastly when I use the pytorch class Linear for my model, the performance gets quite bad.

```
# 1) Design model (input, output, forward pass with different layers)
# 2) Construct loss and optimizer
# 3) Training loop
# - Forward = compute prediction and loss
# - Backward = compute gradients
# - Update weights
import torch
import torch.nn as nn
# Linear regression
# f = w * x
# here : f = 2 * x
# 0) Training samples, watch the shape!
X = torch.tensor([[1], [2], [3], [4]], dtype=torch.float32)
Y = torch.tensor([[2], [4], [6], [8]], dtype=torch.float32)
n_samples, n_features = X.shape
print(f'#samples: {n_samples}, #features: {n_features}')
# 0) create a test sample
X_test = torch.tensor([5], dtype=torch.float32)
# 1) Design Model, the model has to implement the forward pass!
# Here we can use a built-in model from PyTorch
input_size = n_features
output_size = n_features
# we can call this model with samples X
#model = nn.Linear(input_size, output_size)
class LinearRegression(nn.Module):
def __init__(self, input_dim, output_dim):
super(LinearRegression, self).__init__()
# define diferent layers
self.lin = nn.Linear(input_dim, output_dim)
def forward(self, x):
return self.lin(x)
model = LinearRegression(input_size, output_size)
print(f'Prediction before training: f(5) = {model(X_test).item():.7f}')
# 2) Define loss and optimizer
learning_rate = 0.01
n_iters = 10000
loss = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# 3) Training loop
for epoch in range(n_iters):
# predict = forward pass with our model
y_predicted = model(X)
# loss
l = loss(Y, y_predicted)
# calculate gradients = backward pass
l.backward()
# update weights
optimizer.step()
# zero the gradients after updating
optimizer.zero_grad()
if epoch % 100 == 0:
[w, b] = model.parameters() # unpack parameters
print(f'epoch {epoch+1}: w = {w[0][0].item():.7f}, loss = {l.item():.7f}')
print(f'Prediction after training: f(5) = {model(X_test).item():.7f}')
```

Here the code again never converges to the correct weight, and even after 2000 iterations the loss is still decreasing.

Is these something basic I am missing about how pytorch handles autograd and class wrapping? Is the default precision float16 or something? This behavior is unexpected for me.

Thank you very much!