I have these two codes that are supposed to give the same result.

```
import torch
torch.manual_seed(1)
D_in, D_out, H, N = 1000, 10, 100, 100
dtype = torch.float64
device = torch.device('cpu')
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)
model = torch.nn.Sequential(
torch.nn.Linear(D_in, H),
torch.nn.ReLU(),
torch.nn.Linear(H, D_out)
)
model.double()
loss_fn = torch.nn.MSELoss(reduction='sum')
l = 1e-6
for i in range(500):
y_pred = model(x)
loss = loss_fn(y_pred, y)
print(i, loss.item())
model.zero_grad()
loss.backward()
# grad_w_1 = 2.0*(y_pred-y).dot(w_2.T).T.dot(x).T
# grad_w_1[grad_w_1 < 0] = 0
with torch.no_grad():
for param in model.parameters():
param -= l * param.grad
```

and

```
import torch
torch.manual_seed(1)
D_in, D_out, H, N = 1000, 10, 100, 100
dtype = torch.float64
device = torch.device('cpu')
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)
w_1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w_2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)
l = 1e-6
for i in range(500):
h = x.mm(w_1) #NxH
h_relu = h.clamp(min=0)
y_pred = h_relu.mm(w_2) #NxD_out
loss = (y_pred - y).pow(2).sum()
print(i, loss.item())
loss.backward()
# grad_w_1 = 2.0*(y_pred-y).dot(w_2.T).T.dot(x).T
# grad_w_1[grad_w_1 < 0] = 0
with torch.no_grad():
w_1 -= l * w_1.grad
w_2 -= l * w_2.grad
w_1.grad.zero_()
w_2.grad.zero_()
```

The first one’s result is:

```
...
497 791.8494193212754
498 791.394974676931
499 790.9410212657938
```

while the second one’s is:

```
...
497 398.63481097164174
498 396.12781489790547
499 393.64120085253165
```

How do I solve this?

Thanks