Why these two networks don't perform identically

These two networks behave differently: the first one works fine, but the second one's output does not make sense. Am I missing something in the second network? Both are the same architecture: a single hidden layer, 2 input features, and a single output.

Network 1

import torch
import torch.nn.functional as F

class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
        self.predict = torch.nn.Linear(n_hidden, n_output)   # output layer

    def forward(self, x):
        x = F.relu(self.hidden(x))      # activation function for hidden layer
        x = self.predict(x)             # linear output
        return x

Network 2

class Net2(torch.nn.Module):
    def __init__(self, n_features, h_sizes, out_size):
        super(Net2, self).__init__()

        self.input = torch.nn.Linear(n_features, h_sizes[0])   # input -> first hidden layer
        self.hidden = torch.nn.ModuleList()                    # any further hidden layers
        for k in range(len(h_sizes) - 1):
            self.hidden.append(torch.nn.Linear(h_sizes[k], h_sizes[k + 1]))
        self.out = torch.nn.Linear(h_sizes[-1], out_size)      # output layer

    def forward(self, x):
        # Feedforward
        x = F.relu(self.input(x))
        for layer in self.hidden:
            x = F.relu(layer(x))
        output = self.out(x)
        return output

Training code


import numpy as np
import matplotlib.pyplot as plt


def fit(net, x_train, y_train, epochs=20):
    if isinstance(x_train, np.ndarray):
        x_train = torch.from_numpy(x_train).float()
        y_train = torch.from_numpy(y_train).float()
        print(x_train.data.shape, y_train.data.shape)

    plt.ion()   # interactive mode so the plot updates during training

    for t in range(epochs):
        prediction = net(x_train)     # input x and predict based on x

        loss = loss_func(prediction, y_train)     # must be (1. nn output, 2. target)

        optimizer.zero_grad()   # clear gradients for next train
        loss.backward()         # backpropagation, compute gradients
        optimizer.step()        # apply gradients

        if t % (epochs//4) == 0:
            # plot and show learning process
            plt.cla()
            plt.scatter(prediction.data.numpy(), y_train.data.numpy())
            plt.text(0.5, 0, 'Loss=%.4f' % loss.data.numpy(), fontdict={'size': 20, 'color': 'red'})
            plt.pause(0.1)

    plt.ioff()
    plt.show()
    return net, prediction.data.numpy()


def predict(net, x):
    if isinstance(x, np.ndarray):
        x = torch.from_numpy(x).float()
    pred = net(x)
    return pred.data.numpy()


net = Net(n_feature=2, n_hidden=10, n_output=1).float()     # define the first network

optimizer = torch.optim.Adam(net.parameters(), lr=0.02)
loss_func = torch.nn.MSELoss()  # mean squared error loss for regression

net, pred = fit(net, x_train, y_train, epochs=2000)

net2 = Net2(2, h_sizes=[10], out_size=1)     # define the second network
print(net2)  # net architecture
net2 = net2.float()
net2, pred = fit(net2, x_train, y_train, epochs=2000)

I don't think it has much to do with the way the modules have been defined. They look correct to me, and they have identical numbers of parameters and dimensions.
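For example, a quick sanity check along these lines (the count_params helper is just an illustration, not part of the original code) should report the same total for both, assuming Net(n_feature=2, n_hidden=10, n_output=1) and Net2(2, h_sizes=[10], out_size=1):

def count_params(model):
    # total number of trainable parameter elements
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(count_params(Net(n_feature=2, n_hidden=10, n_output=1)))   # 2*10 + 10 + 10*1 + 1 = 41
print(count_params(Net2(2, h_sizes=[10], out_size=1)))           # 41 as well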

What do your optimizers look like for each network? And could you elaborate on this?


Updated. It's the same Adam optimizer for both.

What’s weird in the output? Do you have a snippet or something?

Well, that is the optimizer for net only, not net2.
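A minimal fix, sketched under the assumption that fit keeps reading the global optimizer, is to rebuild the optimizer from net2's parameters before the second fit call (or, better, pass the optimizer into fit as an argument):

net2 = Net2(2, h_sizes=[10], out_size=1).float()
optimizer = torch.optim.Adam(net2.parameters(), lr=0.02)   # fresh optimizer bound to net2's parameters
net2, pred = fit(net2, x_train, y_train, epochs=2000)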

Thank you!! I completely overlooked that!
Obviously net2 was not learning, and that's what was weird.