Why these two netwoks don't perform identically

These two networks behave differently. First one works fine, the second one’s output does not make sense. Am I missing something in the second network? Single hidden layer with 2 units and 2 input features, single output.

Network 1

class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
        self.predict = torch.nn.Linear(n_hidden, n_output)   # output layer

    def forward(self, x):
        x = F.relu(self.hidden(x))      # activation function for hidden layer
        x = self.predict(x)             # linear output
        return x

Network 2

class Net2(torch.nn.Module):
    def __init__(self, n_features, h_sizes, out_size):
        super(Net2, self).__init__()
        self.input = torch.nn.Linear(n_features, h_sizes[0])
        self.hidden = torch.nn.ModuleList()
        for k in range(len(h_sizes)-1):
            self.hidden.append(torch.nn.Linear(h_sizes[k], h_sizes[k+1]))
        # Output layer
        self.out = torch.nn.Linear(h_sizes[-1], out_size)

    def forward(self, x):

        # Feedforward
        x = F.relu(self.input(x))
        for layer in self.hidden:
            x = F.relu(layer(x))
        output= self.out(x)

        return output

Training code

def fit(net, x_train, y_train, epochs=20):
    if isinstance(x_train, np.ndarray):
        x_train = torch.from_numpy(x_train).float()
        y_train = torch.from_numpy(y_train).float()
        print(x_train.data.shape, y_train.data.shape)
    plt.ion()   # something about plotting

    for t in range(epochs):
        prediction = net(x_train)     # input x and predict based on x

        loss = loss_func(prediction, y_train)     # must be (1. nn output, 2. target)

        optimizer.zero_grad()   # clear gradients for next train
        loss.backward()         # backpropagation, compute gradients
        optimizer.step()        # apply gradients

        if t % (epochs//4) == 0:
            # plot and show learning process
            #plt.scatter(x_train.data.numpy(), y_train.data.numpy())
            #plt.plot(x_train.data.numpy(), prediction.data.numpy(), 'r-', lw=5)
            plt.scatter(prediction.data.numpy(), y_train.data.numpy())#, 'r-', lw=5)
            plt.text(0.5, 0, 'Loss=%.4f' % loss.data.numpy(), fontdict={'size': 20, 'color':  'red'})
    return net, prediction.data.numpy()

def predict(net, x):
    if isinstance(x, np.ndarray):
        x = torch.from_numpy(x).float()
    pred = net(x)
    return pred.data.numpy()

optimizer = torch.optim.Adam(net.parameters(), lr=0.02)
loss_func = torch.nn.MSELoss()  # this is for regression mean squared loss

net = Net(n_feature=2, n_hidden=10, n_output=1).float()     # define the network
net, pred = fit(net, x_train, y_train, epochs=2000)

net2  = Net2(2, h_sizes=[10],  out_size=1)     # define the network
print(net)  # net architecture
net2 = net2.float()
net2, pred = fit(net2, x_train, y_train, epochs=2000)

I don’t think it has much to do with the way the modules have been defined. (They seem correct; they have identical numbers of parameters, and dimensions)

How do your optimizers look like, for each network? And could you elaborate on this?


updated. same Adam optimizer for both.

What’s weird in the output? Do you have a snippet or something?

Well that is the optimizer for net only, not net2

Thank you!! I so overlooked that!
Obviously it was not learning, that’s what was weird.