BCEWithLogitsLoss RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

Hi, I am trying to run following network:

class CustomVGG16(torch.nn.Module):
    def __init__(self):
        super(CustomVGG16, self).__init__()
        self.vgg = torchvision.models.vgg16_bn(pretrained = True)
        self.vgg.classifier[-1] = torch.nn.Linear(4096,25)
        self.softmax = torch.nn.Softmax() 
        

    def forward(self, x):
        x = self.vgg(x)
        x = self.softmax(x)
        return x

with following parameters:

criterion = torch.nn.BCEWithLogitsLoss() # log loss
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0005) 

and my training loop is as follows:

def train(model,criterion,optimizer,train_loader,epoch=1, val_loader = None):
    accuracy = []
    val_accuracy = []
    
    train_loss = []
    model = model.to(device)

    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 10, gamma = 0.1)

    for ep in range(epoch):
        running_loss = 0.0
        correct = 0
        total = 0
        start_time = time.time()
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data

            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(torch.argmax(outputs,axis=1).float(), labels.float())
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            correct += (outputs.argmax(1) == labels).float().sum()
            total += len(labels)
        accuracy_local = (correct / total)*100
        accuracy_local = accuracy_local.data.cpu().numpy()
        
        train_loss.append(running_loss)
        accuracy.append(accuracy_local)
        val_acc, val_loss = valid(model,criterion,optimizer,val_loader)
        val_accuracy.append(val_acc)
        scheduler.step()

        print('EPOCH: {:} Accuracy: {:.2f}% Val_Accuracy: {:.2f}% \
        Train_Loss: {:.2f} Validation_Loss: {:.2f} Time: {:.2f} seconds'.format(ep, accuracy_local, val_acc,running_loss, val_loss, time.time() - start_time))

    return accuracy, val_accuracy, train_loss, val_loss

and validation loop:

def valid(model,criterion,optimizer,val_loader):
    running_loss = 0.0
    correct = 0
    total = 0
    for i, data in enumerate(val_loader, 0):
        inputs, labels = data

        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        loss = criterion(torch.argmax(outputs,axis=1).float(), labels.float())
        running_loss += loss.item()
        correct += (outputs.argmax(1) == labels).float().sum()
        total += len(labels)
    return ((correct / total)*100).data.cpu().numpy(), running_loss

for BCEWithLogitLoss, one hot encoded outputs were not working and were giving shape error so I changed it to a 1D array but still this error, I searched through previous answers on this error but nothing helped.

I also tried loss.require_grad = True but it also didn’t work.

What could be the problem here?
Thanks.

You are detaching the computation graph by calling torch.argmax on the model output as this operation is not differentiable:

torch.argmax(outputs,axis=1).float()

nn.BCCEWithLogitsLoss expects logits as the model output and can be used for a multi-label classification (zero, one, or more classes can be active for each sample).
This would also mean that you should remove the softmax operation in your model.

So what could be the solution? other than removing softmax, as I have to include it in my architecture

Remove the softmax and the torch.argmax.

Also, if your target is one-hot encoded, I assume you are dealing with a multi-class classification, so replace nn.BCEWithLogitsLoss with nn.CrossEntropyLoss.

1 Like