Issues with loss while porting numpy to pytorch

As an exercise I’m rewriting a simple numpy example in pytorch, and so far I’ve had trouble matching the results. As a pytorch newbie, it’s quite possible I’ve made some stupid mistake. So far the loss always seems to converge to 0.25 (in my example), and I have no idea why.
Ah, yes, I’m still on pytorch 0.3.1; perhaps I should also consider an upgrade.
Thanks

import torch
import numpy as np
from torch.autograd import Variable

# Problem dimensions: batch size, input features, hidden units, output units.
N, D_in, H, D_out = 4, 3, 4, 1

# Four samples; the target is the XOR of the first two columns, and the
# third column is a constant 1.
x_np = np.array([[0, 0, 1],
                 [0, 1, 1],
                 [1, 0, 1],
                 [1, 1, 1]])
y_np = np.array([[0],
                 [1],
                 [1],
                 [0]])

x = Variable(torch.Tensor(x_np), requires_grad=True)
y = Variable(torch.Tensor(y_np))

# NOTE(review): there is no activation between the two Linear layers, so the
# stack composes into a single affine map of the input.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.Linear(H, D_out),
)

# reduce=False keeps one squared error per sample instead of a scalar loss.
loss_fn = torch.nn.MSELoss(reduce=False)
learning_rate = 1e-4
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for t in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(loss)
    optimizer.zero_grad()
    # The loss is not a scalar, so backward() needs an explicit upstream
    # gradient; all ones is equivalent to backpropagating loss.sum().
    loss.backward(torch.ones(N, D_out))
    optimizer.step()

# Call the module itself rather than .forward() so registered hooks still run.
predicted = model(Variable(torch.from_numpy(x_np).float())).data.numpy()
print('\n' + str(predicted))

numpy version:

import numpy as np
import pdb

def nonlin(x, deriv=False):
    """Sigmoid activation, or its derivative written in terms of its output.

    Args:
        x: scalar or numpy array. With ``deriv=False`` this is the
            pre-activation input. With ``deriv=True`` it must already be a
            sigmoid *output* ``s``, because the derivative is evaluated as
            ``s * (1 - s)``.
        deriv: when truthy, return the derivative instead of the activation.

    Returns:
        ``1 / (1 + exp(-x))`` when ``deriv`` is falsy, otherwise
        ``x * (1 - x)``.
    """
    if deriv:
        return x * (1 - x)
    return 1 / (1 + np.exp(-x))
   
# Four samples; the target is the XOR of the first two columns, and the
# third column is a constant 1.
X = np.array([[0, 0, 1],
              [0, 1, 1],
              [1, 0, 1],
              [1, 1, 1]])

y = np.array([[0],
              [1],
              [1],
              [0]])

# Fixed seed so every run starts from the same weights.
np.random.seed(1)

# Weights drawn uniformly from [-1, 1), i.e. with mean 0.
syn0 = 2 * np.random.random((3, 4)) - 1
syn1 = 2 * np.random.random((4, 1)) - 1

for j in range(500):
    # Forward pass: input -> hidden (sigmoid) -> output (sigmoid).
    l0 = X
    l1 = nonlin(np.dot(l0, syn0))
    l2 = nonlin(np.dot(l1, syn1))

    # Output error, then scale by the sigmoid slope at the output.
    l2_error = y - l2
    print(l2_error)
    l2_delta = l2_error * nonlin(l2, deriv=True)

    # Push the output delta back through syn1 *before* syn1 is updated.
    l1_error = l2_delta.dot(syn1.T)
    l1_delta = l1_error * nonlin(l1, deriv=True)

    # Full-batch update with no step-size factor; the error is (y - l2),
    # so adding moves the weights downhill.
    syn1 += l1.T.dot(l2_delta)
    syn0 += l0.T.dot(l1_delta)

print('\n' + str(l2))

I think I’ve found the issue. The problem was with the choice of optimizer: switching from SGD to Adam seems to work. I’m including the changes for reference.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim

import matplotlib.pyplot as plt
from tqdm import tqdm

# Print arrays with 4 decimal places, without scientific notation, and with
# lines up to 500 characters wide.
np.set_printoptions(precision=4, linewidth=500, suppress=True)

class Example(nn.Module):
    """Two-layer fully connected network with sigmoid activations.

    Args:
        D_in: number of input features.
        D_out: number of output features.
        total: number of hidden units between the two linear layers.
    """

    def __init__(self, D_in, D_out, total):
        super(Example, self).__init__()
        self.syn0 = nn.Linear(D_in, total)
        self.syn1 = nn.Linear(total, D_out)

    def forward(self, x):
        """Return ``sigmoid(syn1(sigmoid(syn0(x))))``."""
        # torch.sigmoid replaces F.sigmoid, which later PyTorch releases
        # deprecate.
        hidden = torch.sigmoid(self.syn0(x))
        return torch.sigmoid(self.syn1(hidden))

# Four samples; the target is the XOR of the first two columns, and the
# third column is a constant 1.
X_np = np.array([[0, 0, 1],
                 [0, 1, 1],
                 [1, 0, 1],
                 [1, 1, 1]])
y_np = np.array([[0],
                 [1],
                 [1],
                 [0]])
X = Variable(torch.Tensor(X_np), requires_grad=True)
y = Variable(torch.Tensor(y_np))

model = Example(D_in=3, D_out=1, total=4)

# Plain SGD was swapped out for Adam; the previous line is kept for comparison:
#     optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
optimizer = optim.Adam(model.parameters(), lr=0.001)

losses = []
for t in tqdm(range(10000)):
    y_pred = model(X)
    # Sum (not mean) of the squared errors over the four samples.
    loss = (y_pred - y).pow(2).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append(loss.data.numpy())

plt.plot(losses)
plt.show()

# Call the module itself rather than .forward() so registered hooks still run.
predicted = model(Variable(torch.from_numpy(X_np).float())).data.numpy()
print('\n' + str(predicted))

As far as I can see, in your first example you didn’t use a non-linearity between the layers, so your model was basically just a matrix multiplication.

Thanks, indeed torch.nn.Sigmoid() helps, but it seems SGD was not a great choice here. Changing to Adam seems to give good results.