Constant prediction in classification task

Hi there,
I’m trying to use a neural network for classification into two classes. Since this did not work with my dataset (the model predicts a constant class for each batch), I wrote a simpler version of the code, but I still can’t find the problem.

Here’s a minimal version of the code:

import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import roc_auc_score
from tqdm import trange

class Model(nn.Module):
    def __init__(self, input_size, hidden_sizes_fc=[100, 2]):
        super().__init__()

        self.fc_list = nn.ModuleList([nn.Linear(input_size, hidden_sizes_fc[0])])
        for hidden_size_fc_ind in range(0, len(hidden_sizes_fc)-1):
            self.fc_list.append(nn.Linear(hidden_sizes_fc[hidden_size_fc_ind],
                                          hidden_sizes_fc[hidden_size_fc_ind+1]))

    def forward(self, x):
        relu = nn.ReLU()
        for i, FC in enumerate(self.fc_list):
            x = FC(x)
            x = relu(x)
        return x

def train_std_nn(net, train, val, epochs, loss_fn):
    optimiser = torch.optim.Adam(net.parameters(), lr=0.0001)
    train_losses_epochs = []
    val_score_epochs = []
    net.train()
    for epoch in trange(epochs):
        train_loss = 0.0
        total_computations = 0
        for X, Y in train:
            output = net(X)
            loss = loss_fn(output, Y)
            loss.backward()
            optimiser.step()
            train_loss += loss.item()
            total_computations += Y.shape[0]
        train_losses_epochs.append(train_loss / total_computations)
    for X_val, Y_val in val:
        output = net(X_val)
        top_p, top_class = torch.topk(output, 1, dim=1)
        pred = torch.flatten(top_class).detach().numpy()
        val_score_epochs.append(roc_auc_score(Y_val.numpy(), pred))
    return net, train_losses_epochs, val_score_epochs


epochs = 10
batch_size = 128

hidden_layers_size = [16, 2]
net = Model(input_size=11, hidden_sizes_fc=hidden_layers_size).double()
loss_fn = nn.CrossEntropyLoss()

aaa = torch.Tensor(np.random.rand(15, 11)).double()#.type(torch.LongTensor)
bbb = torch.Tensor(np.random.randint(0, 2, (15))).type(torch.LongTensor)

net, train_losses_epochs, val_score_epochs = train_std_nn(net, [[aaa, bbb]], [[aaa, bbb]], epochs, loss_fn)

I’ve plotted some graphs of the training loss and the validation score (area under the ROC curve), but the model doesn’t seem to learn anything: the training loss behaves almost randomly (mostly decreasing, but it depends on the run) and the AUC is always 0.5.

Thanks for your help!

Try using softmax() instead of relu() for the output layer.

I also tried

def forward(self, x):
    relu = nn.ReLU()
    sm = nn.Softmax(dim=1)
    for i, FC in enumerate(self.fc_list):
        x = FC(x)
        x = relu(x)
    x = sm(x)
    return x

but that did not work either.

You are still applying relu to the output layer; try:

def forward(self, x):
    relu = nn.ReLU()
    sm = nn.Softmax(dim=1)
    x = self.fc_list[0](x)
    x = relu(x)
    x = self.fc_list[1](x)
    x = sm(x)
    return x

Yeah, sorry, what I meant is: I tried both

def forward(self, x):
    relu = nn.ReLU()
    sm = nn.Softmax(dim=1)
    for i, FC in enumerate(self.fc_list):
        x = FC(x)
        x = relu(x)
    x = sm(x)
    return x

and

def forward(self, x):
    relu = nn.ReLU()
    sm = nn.Softmax(dim=1)
    x = self.fc_list[0](x)
    x = relu(x)
    x = self.fc_list[1](x)
    x = sm(x)
    return x

but neither of them works.

Sorry, my bad, but I think you forgot to call `optimiser.zero_grad()` before `loss.backward()`.
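
For reference, a minimal sketch of the inner training step with the gradients zeroed before each backward pass, using the names from your code:

for X, Y in train:
    optimiser.zero_grad()      # clear gradients accumulated in the previous step
    output = net(X)
    loss = loss_fn(output, Y)
    loss.backward()            # compute fresh gradients for this batch
    optimiser.step()           # update the parameters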

Thanks, I inserted that, but I still get very bad predictions somehow…

nn.CrossEntropyLoss expects raw logits as the model output, so remove the softmax and relu and pass the output of the last linear layer to the loss function.
Also, as explained before, you are not zeroing out the gradients.
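
In other words, keep the softmax out of the model and apply it only when you need probabilities, e.g. for the AUC computation. A rough sketch with the names from your code (the argmax gives the same result with or without the softmax, since softmax is monotonic):

logits = net(X_val)                    # raw, unnormalised scores from the last nn.Linear
probs = torch.softmax(logits, dim=1)   # probabilities, only needed for metrics/reporting
preds = torch.argmax(logits, dim=1)    # hard class predictions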

Thanks! The code now looks like this:

class Model(nn.Module):
    def __init__(self, input_size, hidden_sizes_fc=[100, 2]):
        super().__init__()

        self.fc_list = nn.ModuleList([nn.Linear(input_size, hidden_sizes_fc[0])])
        for hidden_size_fc_ind in range(0, len(hidden_sizes_fc)-1):
            self.fc_list.append(nn.Linear(hidden_sizes_fc[hidden_size_fc_ind],
                                          hidden_sizes_fc[hidden_size_fc_ind+1]))

    def forward(self, x):
        relu = nn.ReLU()
        x = self.fc_list[0](x)
        x = relu(x)
        x = self.fc_list[1](x)
        return x

def train_std_nn(net, train, val, epochs, loss_fn):
    optimiser = torch.optim.Adam(net.parameters(), lr=0.0001)
    train_losses_epochs = []
    val_score_epochs = []
    net.train()
    for epoch in trange(epochs):
        train_loss = 0.0
        total_computations = 0
        for X, Y in train:
            output = net(X)
            loss = loss_fn(output, Y)
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()
            train_loss += loss.item()
            total_computations += Y.shape[0]
        train_losses_epochs.append(train_loss / total_computations)
    for X_val, Y_val in val:
        output = net(X_val)
        top_p, top_class = torch.topk(output, 1, dim=1)
        pred = torch.flatten(top_class).detach().numpy()
        val_score_epochs.append(roc_auc_score(Y_val.numpy(), pred))
    return net, train_losses_epochs, val_score_epochs


epochs = 200
batch_size = 4

hidden_layers_size = [16, 2]
net = Model(input_size=11, hidden_sizes_fc=hidden_layers_size).double()
loss_fn = nn.CrossEntropyLoss()

aaa = torch.Tensor(np.random.rand(15, 11)).double()#.type(torch.LongTensor)
bbb = torch.Tensor(np.random.randint(0, 2, (15))).type(torch.LongTensor)

net, train_losses_epochs, val_score_epochs = train_std_nn(net, [[aaa, bbb]], [[aaa, bbb]], epochs, loss_fn)

With e.g. the target vector [1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0], the prediction after 200 epochs (which I would consider enough to overfit the data) is still [1 0 1 1 1 1 1 1 1 1 1 1 1 1 1]. I ran the code a few times and always get very bad predictions.

I can perfectly overfit random samples using your code, so you might want to increase the learning rate to let it converge faster (it still converges with your lr of 1e-4, but it takes more epochs):

class Model(nn.Module):
    def __init__(self, input_size, hidden_sizes_fc=[100, 2]):
        super().__init__()
       
        self.fc_list = nn.ModuleList([nn.Linear(input_size, hidden_sizes_fc[0])])
        for hidden_size_fc_ind in range(0, len(hidden_sizes_fc)-1):
            self.fc_list.append(nn.Linear(hidden_sizes_fc[hidden_size_fc_ind],
                                          hidden_sizes_fc[hidden_size_fc_ind+1]))
    
    
    def forward(self, x):
        relu = nn.ReLU()
        x = self.fc_list[0](x)
        x = relu(x)
        x = self.fc_list[1](x)
        return x

hidden_layers_size = [16, 2]
net = Model(input_size=11, hidden_sizes_fc=hidden_layers_size)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

data = torch.rand(15, 11)
target = torch.randint(0, 2, (15,))

for epoch in range(1000):
    optimizer.zero_grad()
    output = net(data)
    loss = loss_fn(output, target)
    loss.backward()
    optimizer.step()
    preds = torch.argmax(output, dim=1)
    print('epoch {}, loss {:.3f}, acc {}'.format(
        epoch, loss.item(), (preds==target).float().mean()))