Same values in every epoch when training

I’ve tried to create a simple graph neural network with PyTorch Geometric. However, I’m getting the same loss for every epoch while training. Here’s the code:

# Imports inferred from the usage below
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.data import DataLoader
from torch_geometric.nn import GCNConv

train_loader = DataLoader(train_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
## Building the Graph Neural Network
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 50)
        self.conv2 = GCNConv(50, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

device = torch.device('cuda')
model = Net().double().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
crit = torch.nn.BCELoss()

def train():
    model.train()

    loss_all = 0
    for data in train_loader:
        optimizer.zero_grad()
        data = data.to(device)

        output = model(data)
        label = data.y.to(device)

        loss = crit(output.double(), label.double())
        loss.backward()
        loss_all += data.num_graphs * loss.item()
        optimizer.step()
    return loss_all / len(train_dataset)

from sklearn.metrics import zero_one_loss

def evaluate(loader):
    model.eval()

    predictions = []
    labels = []

    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            pred = model(data).detach().cpu().numpy().astype('uint8').flatten()
            label = data.y.detach().cpu().numpy().astype('uint8')

            predictions.append(pred)
            labels.append(label)
        
        predictions, labels = np.concatenate(predictions), np.concatenate(labels)
        
        # Calculate accuracy
        acc = zero_one_loss(labels, predictions)
        return acc

for epoch in range(1, 200):
    loss = train()
    train_acc = evaluate(train_loader)
    val_acc = evaluate(val_loader)    
    test_acc = evaluate(test_loader)
    print('Epoch: {:03d}, Loss: {:.5f}, Train Auc: {:.5f}, Val Auc: {:.5f}, Test Auc: {:.5f}'.
          format(epoch, loss, train_acc, val_acc, test_acc))

Here’s the output:

Epoch: 001, Loss: 19391.92254, Train Auc: 0.54593, Val Auc: 0.53062, Test Auc: 0.56377
Epoch: 002, Loss: 19391.92254, Train Auc: 0.54593, Val Auc: 0.53062, Test Auc: 0.56377
Epoch: 003, Loss: 19391.92254, Train Auc: 0.54593, Val Auc: 0.53062, Test Auc: 0.56377
Epoch: 004, Loss: 19391.92254, Train Auc: 0.54593, Val Auc: 0.53062, Test Auc: 0.56377
Epoch: 005, Loss: 19391.92254, Train Auc: 0.54593, Val Auc: 0.53062, Test Auc: 0.56377
Epoch: 006, Loss: 19391.92254, Train Auc: 0.54593, Val Auc: 0.53062, Test Auc: 0.56377
Epoch: 007, Loss: 19391.92254, Train Auc: 0.54593, Val Auc: 0.53062, Test Auc: 0.56377
Epoch: 008, Loss: 19391.92254, Train Auc: 0.54593, Val Auc: 0.53062, Test Auc: 0.56377
Epoch: 009, Loss: 19391.92254, Train Auc: 0.54593, Val Auc: 0.53062, Test Auc: 0.56377
Epoch: 010, Loss: 19391.92254, Train Auc: 0.54593, Val Auc: 0.53062, Test Auc: 0.56377

Another thing I’ve noticed is that I’m getting only zeros in the train function when calling output = model(data). Does this mean that I have an issue with the data or with my neural network?

I’d appreciate any help.

F.log_softmax is usually used with nn.NLLLoss, while you are using nn.BCELoss, which expects a sigmoid output.
The former is used for multi-class classification, the latter for binary or multi-label classification.
I would recommend removing the F.log_softmax and replacing it with torch.sigmoid, or changing the criterion, if that fits your use case. This also explains the all-zero outputs you are seeing: with a single output channel, F.log_softmax(x, dim=1) is always log(1) = 0.
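
As a concrete illustration, here is a minimal sketch with random tensors (not your actual dataset) comparing the two options:

import torch
import torch.nn as nn

logits = torch.randn(8, 1)                    # raw model output, shape (num_nodes, 1)
target = torch.randint(0, 2, (8, 1)).float()  # binary labels

# Option 1: keep nn.BCELoss and apply a sigmoid to the model output
probs = torch.sigmoid(logits)
loss1 = nn.BCELoss()(probs, target)

# Option 2: return the raw logits and use nn.BCEWithLogitsLoss,
# which applies the sigmoid internally and is numerically more stable
loss2 = nn.BCEWithLogitsLoss()(logits, target)

print(loss1.item(), loss2.item())             # both values match up to numerical precision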

I am unable to train the model:

@ptrblck I have tried everything… but couldn’t solve it :frowning:

# Imports inferred from the usage below
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils import data

class LogisticRegression(nn.Module):

    def __init__(self, input_size, output_size, hidden_size):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_size, hidden_size)
        self.relu = nn.Sigmoid()
        self.linear2 = nn.Linear(hidden_size, output_size)
        self.initialize()

    def initialize(self):
        nn.init.xavier_uniform(self.linear.weight.data)
        nn.init.xavier_uniform(self.linear2.weight.data)
        self.linear.bias.data.zero_()
        self.linear2.bias.data.zero_()

    def forward(self, x):
        hidden = self.linear(x)
        out = self.relu(hidden)
        out = self.linear2(out)
        return out

Model Training

l_batch_sz = 16

X = [torch.from_numpy(np.array(item)).float() for item in dev_loss]
Y = [torch.from_numpy(np.array(item)).float() for item in noise_flag]
X = Variable(torch.FloatTensor(X), requires_grad = True).cuda()
Y = Variable(torch.FloatTensor(Y), requires_grad = True).cuda()

l_dataset = data.TensorDataset(X,Y)
l_loader = data.DataLoader(l_dataset, batch_size=l_batch_sz, drop_last=True)

l_input_size = l_batch_sz
l_output_size = l_batch_sz
l_hidden_size = 64
l_total_epoch = 100
l_learning_rate = 0.01

l_model = LogisticRegression(l_input_size, l_output_size, l_hidden_size)
l_model.cuda()
l_model.train()
print('l_model.parameters() : ',l_model.parameters())
l_criterion = nn.BCEWithLogitsLoss()
l_optimizer = torch.optim.Adam(l_model.parameters(), lr=l_learning_rate)

import ipdb

for epoch in range(l_total_epoch):
    for batch_idx, (x, y) in enumerate(l_loader):
        l_optimizer.zero_grad()
        output = l_model(x)
        l_loss = l_criterion(output, y)

        l_loss.backward()
        l_optimizer.step()

        if (batch_idx + 1) % 1 == 0:
            print('\nEpoch: [%d/%d], Loss: %.4f' % (epoch+1, l_total_epoch, l_loss))

Change the nn.init methods to their inplace versions and remove the .data usage:

    nn.init.xavier_uniform_(self.linear.weight)
    nn.init.xavier_uniform_(self.linear2.weight)

Also, I’m unsure if this fits your use case, but you are setting the batch size as both the input and output size, so you should double-check that this is indeed correct.
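
In nn.Linear, in_features is the number of features per sample, not the batch size; the batch dimension is handled automatically. A short sketch, assuming each of your samples is a single scalar feature:

import torch
import torch.nn as nn

batch_size, n_features = 16, 1           # the batch size is independent of the layer sizes
x = torch.randn(batch_size, n_features)  # input of shape (batch, features)

layer = nn.Linear(n_features, 64)        # in_features = features per sample
print(layer(x).shape)                    # torch.Size([16, 64])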

Play around with some hyperparameters, such as lowering the learning rate, swapping nn.Sigmoid for nn.ReLU or another non-linearity, etc., and try to overfit a small dataset first (e.g. just 10 samples).
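
To sanity-check that the model and loss can learn at all, something along these lines should drive the loss close to zero (a rough sketch with randomly generated data and hypothetical names, not your actual pipeline):

import torch
import torch.nn as nn

# 10 random samples with 4 features each and binary targets (hypothetical shapes)
tiny_x = torch.randn(10, 4)
tiny_y = torch.randint(0, 2, (10, 1)).float()

model = nn.Sequential(nn.Linear(4, 64), nn.ReLU(), nn.Linear(64, 1))
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Overfit the tiny subset: if the loss does not drop towards zero,
# the model/loss/optimizer setup is broken rather than the task being hard.
for epoch in range(500):
    optimizer.zero_grad()
    loss = criterion(model(tiny_x), tiny_y)
    loss.backward()
    optimizer.step()

print(loss.item())   # should end up close to 0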