Element 0 of tensors does not require grad and does not have a grad_fn

facepalm. yes my problem is there is torch.autograd.set_grad_enabled(False) in the notebook. thank you @ptrblck.
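For anyone else landing here: a minimal sketch (not the original notebook) of how a leftover set_grad_enabled(False) produces exactly this error, and how re-enabling grad mode fixes it:

import torch
import torch.nn as nn

torch.autograd.set_grad_enabled(False)  # e.g. left over in an earlier notebook cell

model = nn.Linear(4, 2)
out = model(torch.randn(8, 4))
print(out.grad_fn)  # None -> out.sum().backward() would raise the RuntimeError

torch.autograd.set_grad_enabled(True)  # re-enable autograd
out = model(torch.randn(8, 4))
out.sum().backward()  # works again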

Hello guys

I am trying to train an SE-ResNeXt model using the pretrainedmodels package.

I want to fine-tune only the last layer of the model so that it has three output classes instead of 1000. I have tried many of the things suggested here, but nothing seems to work for me and I keep getting the same error:

element 0 of tensors does not require grad and does not have a grad_fn

The error only goes away if I do not set param.requires_grad = False. But I need the model to keep the pretrained weights in all the layers.

Any suggestions would be very helpful, thank you :grinning:

@ptrblck I don’t know what I’ve done wrong; obviously I don’t have much experience. I’ve tried almost everything you’ve suggested in your answers, so your help is more than appreciated.

model = pretrainedmodels.__dict__["se_resnext50_32x4d"](pretrained="imagenet",num_classes = 1000)



for param in model.parameters():
    param.requires_grad = False

num_ftrs = model.last_linear.in_features
model = nn.Sequential(*list(model.children())[:-1])
model.fc = nn.Linear(num_ftrs,3)

import torch.optim as optim
acc_list = []
running_loss = 0.0

opt = optim.Adam(model.fc.parameters())
criterion = nn.CrossEntropyLoss()

train_dl = DataLoader(trainset, batch_size=64)
val_dl = DataLoader(valset, batch_size=64)
total_step = len(train_dl)
eval_accu = []
val_correct = 0
val_total = 0
val_running_loss = 0.0

for epoch in range(3):  # loop over the dataset 
    model.train(True)
    for i, (inputs,labels) in enumerate(train_dl, 0):

        opt.zero_grad()
       
        outputs = model(inputs)
        loss = criterion(outputs, torch.max(labels, 1)[1])
        loss.backward()
        opt.step()
       
        total = labels.size(0)
        _, predicted = torch.max(outputs.data, 1)
        
        correct = (predicted == labels).sum().item()
        acc_list.append(correct / total)

        # print statistics
        running_loss += loss.item()
    model.train(False)   
    with torch.no_grad():
        model.eval()
        for data in val_dl:
            images, labels = data
           
            outputs = model(images)
           
            val_loss = criterion(outputs, torch.max(labels, 1)[1])
            
            _, predicted = torch.max(outputs.data, 1)
            val_total += labels.size(0)
            
            val_correct = (predicted == labels).sum().item()
            eval_accu.append(val_correct / val_total)
            val_running_loss += val_loss.item()
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.3f}, Accuracy: {:.2f}%, Val_Accuracy:{:.2f}%'
            .format(epoch + 1, 4, i + 1, total_step, running_loss / 64,(correct / 64) * 100,(val_correct / 64)*100))
        
    

print('Finished Training')

My exact error is

Traceback (most recent call last):
  File "seresnext_model.py", line 442, in <module>
    loss.backward()
  File "...", line 195, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "...", line 99, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

I don’t know how the pretrained model is implemented, but these lines of code look a bit wrong:

num_ftrs = model.last_linear.in_features
model = nn.Sequential(*list(model.children())[:-1])
model.fc = nn.Linear(num_ftrs,3)

In the first line you are using model.last_linear.in_features and are assigning a new layer using these in_features to model.fc.
Assuming that last_linear is a real layer and used, this would most likely mean that fc is a new attribute, which is never used.
If that’s the case, assign the new nn.Linear layer to model.last_linear and it should work.
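In code, the suggested change could look roughly like this (a sketch; only the last_linear attribute name is taken from the pretrainedmodels implementation discussed above):

import torch
import torch.nn as nn
import pretrainedmodels

model = pretrainedmodels.__dict__["se_resnext50_32x4d"](pretrained="imagenet", num_classes=1000)

for param in model.parameters():
    param.requires_grad = False  # freeze the pretrained backbone

num_ftrs = model.last_linear.in_features
model.last_linear = nn.Linear(num_ftrs, 3)  # replace the layer that is actually used in forward

opt = torch.optim.Adam(model.last_linear.parameters())  # optimize only the new head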


That did the trick, thank you very much for your help :smiley: You are a savior.

I’m getting the same error message and I can’t work out why… I’m using the code on two different computers, one with and one without CUDA. The error only happens on the one using the CPU, and everything worked just fine before I added the .to(device).

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.train()
dataset = TensorDataset(Variable(Tensor(trainX)), Variable(Tensor(trainY)))
trainloader = DataLoader(dataset, batch_size=batch_size, pin_memory=True)

for e in range(epochs):
    for idx, (images, labels) in enumerate(trainloader):
        optimizer.zero_grad()
        output = model(images.to(device))
        loss = criterion(output, labels.to(device))
        loss.backward()
        optimizer.step()

The error message is RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn, and it happens at the line loss.backward().

Usually these errors can happen if you are detaching some tensors from the computation graph, as described here. Could you check if this might be the case?
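For reference, a tiny self-contained sketch (not the code above) of how such a detach reproduces the error:

import torch
import torch.nn as nn

model = nn.Linear(4, 2)
criterion = nn.MSELoss()

out = model(torch.randn(8, 4))
loss = criterion(out.detach(), torch.randn(8, 2))  # .detach() cuts the graph

print(loss.requires_grad)  # False
# loss.backward()  # -> RuntimeError: element 0 of tensors does not require grad ...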

Hi, I am having the same issue, but for EfficientNet rather than ResNet. Can you please help me?

from efficientnet_pytorch import EfficientNet
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EfficientNet.from_pretrained('efficientnet-b0')

for param in model.parameters():
    param.requires_grad = False

model.classifier_layer = nn.Sequential(
    nn.Linear(1280, 512),
    nn.BatchNorm1d(512),
    nn.Dropout(0.2),
    nn.Linear(512, 256),
    nn.Linear(256, 2)
)

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=1e-4)
model.to(device)

epochs = 300
steps = 0
running_loss = 0
print_every = 10
train_losses, test_losses = [], []

for epoch in range(epochs):
    for inputs, labels in trainloader:

        steps += 1
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()

        logps = model.forward(inputs)
        loss = criterion(logps, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if steps % print_every == 0:
            test_loss = 0
            accuracy = 0
            model.eval()
            with torch.no_grad():
                for inputs, labels in testloader:
                    inputs, labels = inputs.to(device), labels.to(device)

                    logps = model.forward(inputs)
                    batch_loss = criterion(logps, labels)
                    test_loss += batch_loss.item()

                    ps = torch.exp(logps)
                    top_p, top_class = ps.topk(1, dim=1)
                    equals = top_class == labels.view(*top_class.shape)
                    accuracy += torch.mean(equals.type(torch.FloatTensor)).item()

            train_losses.append(running_loss/len(trainloader))
            test_losses.append(test_loss/len(testloader))

            print(f"Epoch {epoch+1}/{epochs}.. "
                  f"Train loss: {running_loss/print_every:.3f}.. "
                  f"Test loss: {test_loss/len(testloader):.3f}.. "
                  f"Test accuracy: {accuracy/len(testloader):.3f}")

            running_loss = 0
            model.train()

torch.save(model, 'mymodelAA_new.pth')

Could you check if model.classifier_layer is set before you assign the nn.Sequential container to it?
If not, note that this layer will never be used, and since you are freezing all other parameters of the model, you'll encounter this error.
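A quick way to check this (a sketch, using the efficientnet_pytorch package from the post above):

from efficientnet_pytorch import EfficientNet

model = EfficientNet.from_pretrained('efficientnet-b0')

print(hasattr(model, 'classifier_layer'))  # False -> assigning to it creates an unused attribute
print(hasattr(model, '_fc'))               # True  -> this is the classifier the forward pass uses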

Thank you for your answer. I just changed the name model.classifier_layer to model._fc and it works now. Am I fixing it in the right way? Could it be because EfficientNet uses _fc rather than classifier_layer?

This sounds reasonable. You could either check it by printing the model (print(model)), which would show all layers and should thus also show the _fc layer, or by checking the source code of the implementation, which would show the initialization of all layers as well as their usage.

Thank you, it seems like it is:

)
(_bn1): BatchNorm2d(1280, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
(_avg_pooling): AdaptiveAvgPool2d(output_size=1)
(_dropout): Dropout(p=0.2, inplace=False)
(_fc): Linear(in_features=1280, out_features=1000, bias=True)
(_swish): MemoryEfficientSwish()
)

So yeah, now that I use _fc it works. Thank you.

Hi, sorry, I have one more question: how can I compute something like a confusion matrix here? Is there any module in torch for that?

There might be 3rd party libraries built on PyTorch, which could provide an implementation to calculate the confusion matrix, but I would just use common libraries, such as scikit-learn, and pass the predictions as well as targets as numpy arrays to it.
E.g. take a look at sklearn.metrics.confusion_matrix to avoid “reinventing the wheel”. :wink:
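E.g. a rough sketch (model, val_dl, and device are placeholders for your own objects):

import torch
from sklearn.metrics import confusion_matrix

all_preds, all_targets = [], []
model.eval()
with torch.no_grad():
    for images, labels in val_dl:
        preds = model(images.to(device)).argmax(dim=1)
        all_preds.append(preds.cpu())
        all_targets.append(labels.cpu())

cm = confusion_matrix(torch.cat(all_targets).numpy(), torch.cat(all_preds).numpy())
print(cm)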

@ptrblck
I am facing the same situation.

# Model.
model = my_model()

criterium = nn.MSELoss()

# Adam optimizer with learning rate 0.1 and L2 regularization with weight 1e-4.
optimizer = torch.optim.Adam(model.parameters(),lr=0.1, weight_decay=1e-4)
# Set gradient to 0.
optimizer.zero_grad()

# Feed forward.
pred = model(data)
pred_max = torch.max(pred)
pred_min = torch.min(pred)
pred = 255* (pred - depth_min) / (pred_max - pred_min )

# Loss calculation.
loss = criterium(pred , target)

# Gradient calculation.
loss.backward()

The run stops at loss.backward().

  1. It seems this issue is caused by the detach() from torch.max().
     For my case, how could I find the max and min values before the loss function?
  2. Are there other functions that will cause a detach()?

Thanks

torch.max does not detach the output values from the computation graph; only the returned indices are detached.
Your code works fine using random input tensors:

# Model.
model = nn.Linear(1, 1)

criterium = nn.MSELoss()

# Adam optimizer with learning rate 0.1 and L2 regularization with weight 1e-4.
optimizer = torch.optim.Adam(model.parameters(),lr=0.1, weight_decay=1e-4)
# Set gradient to 0.
optimizer.zero_grad()

# Feed forward.
data = torch.randn(1, 1)
pred = model(data)
pred_max = torch.max(pred)
pred_min = torch.min(pred)
depth_min = 1
pred = 255* (pred - depth_min) / (pred_max - pred_min )

# Loss calculation.
target = torch.randn(1, 1)
loss = criterium(pred , target)

# Gradient calculation.
loss.backward()

@ptrblck
Thank you for your quick reply.
I really appreciate it.

I tested the code on another machine with a different GPU and it works, but the 2080 GPU gives the above issue.
Could something else cause this?

Different PyTorch versions could already have fixed certain issues, but using a GPU wouldn't change the behavior of Autograd.

Are there other operations that may cause a tensor to be detached from the graph implicitly?

Yes, generally if you “leave” PyTorch, i.e. use another library such as numpy, Autograd won’t be able to track these operations and thus the tensor would be detached (you could write a custom autograd.Function and provide the forward and backward method manually).
Also, operations on integers are not differentiable (such as torch.argmax). Besides that you could check derivatives.yaml for the not_implemented keyword.
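A small illustration of both cases (a sketch, not taken from the code in this thread):

import torch

x = torch.randn(4, requires_grad=True)

# leaving PyTorch: a numpy round trip is not tracked by Autograd
y = torch.from_numpy(x.detach().numpy()) * 2
print(y.grad_fn)  # None

# integer-valued ops such as torch.argmax are not differentiable
print(torch.argmax(x).grad_fn)  # None

# the values returned by torch.max, in contrast, stay attached to the graph
vals, idx = torch.max(x.view(2, 2), dim=1)
print(vals.grad_fn)  # <MaxBackward0 ...>
print(idx.grad_fn)   # None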


@ptrblck
In my data preparation function, I want to resize the samples before feeding them to the model for training.
I found that if I add the resizer, I get:

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

But if I comment out the resizer line, it runs successfully. Are there any ops in the resizer that PyTorch doesn't like?
Thanks.

class BtsDataLoader(object):
    def __init__(self, args, mode):
        if mode == 'train':
            self.training_samples = DataLoadPreprocess(args, mode, transform=preprocessing_transforms(mode))
            if args.distributed:
                self.train_sampler = torch.utils.data.distributed.DistributedSampler(self.training_samples)
            else:
                self.train_sampler = None

            self.data = DataLoader(self.training_samples, args.batch_size,
                                   shuffle=(self.train_sampler is None),
                                   num_workers=args.num_threads,
                                   pin_memory=True,
                                   sampler=self.train_sampler)

class DataLoadPreprocess(Dataset):
      ....
      def __getitem__(self, idx):
           image, pattern_gt = self.random_crop(image, depth_gt)
           image, pattern_gt = self.resizer(image, pattern_gt , self.args.input_height, self.args.input_width)
      
      def resizer(self, img, depth, height, width):
        dim = (width, height)
        img = cv2.resize(img, dim, interpolation = cv2.INTER_AREA)
        depth = cv2.resize(depth, dim, interpolation = cv2.INTER_AREA)

        depth_show = np.where(depth < 1e-3, depth * 0, depth)
        invalid_mask = (depth_show==0)
        # depth_show = 1/depth_show

        depth_min = np.nanmin(depth_show)
        depth_max = np.nanmax(depth_show)
        bits = 1
        max_val = (2**(8*bits))-1

        if depth_max - depth_min !=0:
            depth_map = max_val * (depth_show - depth_min) / (depth_max - depth_min)
            # depth_map = (depth_show - depth_min) / (depth_max - depth_min)
        else:
            depth_map = 0

        depth_map_show = np.where(invalid_mask, 0, 255 - depth_map.astype('uint8'))

        # cv2.imwrite("after_resize.png", depth_map_show)
        depth = np.expand_dims(depth_map_show, 2)
        return img, depth