Element 0 of tensors does not require grad and does not have a grad_fn

yingjiao_liu · July 20, 2020, 3:11am

Thank you, Sir. You are a life saver.
I modified these lines as you suggested and it now works without error. But when I evaluate the linear model for speaker identification, the speaker_scores seem to be wrong. I am not sure whether this is related to the error you helped me fix.
As the output of speaker_scores shows below, somehow the first number in each column always is the largest. When I need to get index of the largest score with tensor.argmax, I got zeros all the time. Does the output of speaker_scores look right? Do you have any suggestions here? I really appreciate your help!!!

def evaluate_linear_model(config, apc_model, linear_model, criterion, device, data_loader):
    """Evaluate the linear speaker classifier on every batch of ``data_loader``.

    Args:
        config: Passed through unchanged to the helper functions.
        apc_model: Model producing the internal representation (see
            ``pass_inputs_through_model``).
        linear_model: Classifier under evaluation; switched to eval mode.
        criterion: Loss function forwarded to ``pass_inputs_through_model``.
        device: Device the batches are moved to.
        data_loader: Iterable yielding ``(batch_x, batch_l, batch_a)`` triples.

    Returns:
        tuple: ``(ser, losses)`` where ``ser`` is the error rate in percent
        over all unmasked positions and ``losses`` is the list of per-batch
        loss values.

    Raises:
        ValueError: If no unmasked position was seen (e.g. empty loader).
    """
    linear_model.eval()
    losses = []
    # The counters must live outside the batch loop; re-initialising them per
    # batch would make the final score reflect only the last batch.
    correct = 0  # numerator - number of correct, unmasked predictions
    total_speakers = 0  # denominator - number of unmasked positions
    with torch.no_grad():
        for batch_x, batch_l, batch_a in data_loader:
            batch_x, batch_l, batch_a = setup_inputs(config, batch_x, batch_l, batch_a, device)
            loss, speaker_scores = pass_inputs_through_model(
                config, apc_model, linear_model, criterion, batch_x, batch_l, batch_a, device)
            losses.append(loss.item())
            # Class scores are on dim 1, so argmax over it yields class indices.
            speaker_class_predictions = torch.argmax(speaker_scores, dim=1)

            # find positions where we have a correct prediction
            correct_pred = (speaker_class_predictions == batch_a)
            lengths_mask = get_lengths_mask(correct_pred, batch_l, device)
            masked_correct_pred = lengths_mask * correct_pred

            correct += masked_correct_pred.sum().item()
            # Count the same positions the numerator can count.
            # NOTE(review): assumes get_lengths_mask returns a 0/1 mask shaped
            # like ``correct_pred`` - confirm against its definition.
            total_speakers += lengths_mask.sum().item()

    if total_speakers == 0:
        raise ValueError("evaluate_linear_model: no unmasked positions to score")
    ser = 100 * ((total_speakers - correct) / total_speakers)
    return ser, losses

def setup_inputs(config, batch_x, batch_l, batch_a, device):
    """Reorder a batch by descending length and move it to ``device``.

    Args:
        config: Unused here; kept so the call signature stays unchanged.
        batch_x: Input tensor, indexed along dim 0 by the sort order.
        batch_l: 1-D tensor of lengths used as the sort key.
        batch_a: Target tensor, indexed along dim 0 by the sort order.
        device: Destination device for all three tensors.

    Returns:
        tuple: ``(batch_x, batch_l, batch_a)`` reordered so that ``batch_l``
        is non-increasing, all on ``device``.
    """
    _, indices = torch.sort(batch_l, descending=True)
    # ``Variable`` has been a no-op wrapper since PyTorch 0.4 (tensors carry
    # autograd state themselves), so the tensors are used directly.
    return (
        batch_x[indices].to(device),
        batch_l[indices].to(device),
        batch_a[indices].to(device),
    )


def pass_inputs_through_model(config, apc_model, linear_model, criterion, batch_x, batch_l, batch_a, device):
    """Score a batch with the linear model on top of detached APC features.

    The second output of ``apc_model.forward`` is detached before it is fed to
    ``linear_model``, so no gradient flows back into ``apc_model``. The
    element-wise loss from ``criterion`` is multiplied by the mask returned by
    ``get_lengths_mask`` and then averaged over all elements.

    Args:
        config: Unused in this function.
        apc_model: Feature extractor; its ``forward`` must return a pair.
        linear_model: Classifier applied to the detached features.
        criterion: Loss returning an unreduced tensor (it is masked below).
        batch_x, batch_l, batch_a: Inputs, lengths and targets of the batch.
        device: Forwarded to ``get_lengths_mask``.

    Returns:
        tuple: ``(loss, speaker_scores)``.
    """
    apc_outputs = apc_model.forward(batch_x, batch_l)
    _, last_layer_rep = apc_outputs  # last RNN layer
    frozen_rep = last_layer_rep.detach()

    speaker_scores = linear_model(frozen_rep)
    per_position_loss = criterion(speaker_scores, batch_a)

    mask = get_lengths_mask(per_position_loss, batch_l, device)
    loss = (mask * per_position_loss).mean()
    return loss, speaker_scores

Here is the output of speaker_scores (size:[16, 1600, 40]) and speaker_class_predictions(size:[16, 40]) at the first iteration.

speaker_scores tensor([[[ 8.0500e+00,  8.0797e+00,  8.0590e+00,  ...,  7.9894e+00,
           7.9996e+00,  8.0968e+00],
         [ 6.5967e+00,  6.6829e+00,  6.4854e+00,  ...,  6.5828e+00,
           6.5779e+00,  6.6216e+00],
         [ 3.8995e+00,  4.0742e+00,  3.8886e+00,  ...,  3.8872e+00,
           3.9899e+00,  3.9925e+00],
         ...,
         [-3.8165e+00, -3.7231e+00, -3.8339e+00,  ..., -3.5709e+00,
          -3.6196e+00, -3.8674e+00],
         [-3.3267e+00, -3.2683e+00, -3.3269e+00,  ..., -3.0943e+00,
          -3.1706e+00, -3.3947e+00],
         [-3.9622e+00, -3.8175e+00, -3.9419e+00,  ..., -3.6931e+00,
          -3.7702e+00, -4.0186e+00]],

        [[ 6.9792e+00,  6.9930e+00,  6.9780e+00,  ...,  6.9057e+00,
           6.8974e+00,  6.9623e+00],
         [ 6.3637e+00,  6.4508e+00,  6.2607e+00,  ...,  6.2922e+00,
           6.3237e+00,  6.3503e+00],
         [ 4.3157e+00,  4.4286e+00,  4.2575e+00,  ...,  4.2260e+00,
           4.2711e+00,  4.3420e+00],
         ...,
         [ 5.4524e-03,  2.4385e-02, -3.9649e-02,  ..., -2.8054e-02,
           2.8421e-03,  3.4010e-02],
         [ 5.4524e-03,  2.4385e-02, -3.9649e-02,  ..., -2.8054e-02,
           2.8421e-03,  3.4010e-02],
         [ 5.4524e-03,  2.4385e-02, -3.9649e-02,  ..., -2.8054e-02,
           2.8421e-03,  3.4010e-02]],

        [[ 7.1278e+00,  7.1513e+00,  7.1129e+00,  ...,  7.0446e+00,
           7.0547e+00,  7.1034e+00],
         [ 6.4023e+00,  6.4507e+00,  6.2547e+00,  ...,  6.3206e+00,
           6.3188e+00,  6.3049e+00],
         [ 3.7676e+00,  3.9320e+00,  3.7099e+00,  ...,  3.6738e+00,
           3.7673e+00,  3.8048e+00],
         ...,
         [ 5.4524e-03,  2.4385e-02, -3.9649e-02,  ..., -2.8054e-02,
           2.8421e-03,  3.4010e-02],
         [ 5.4524e-03,  2.4385e-02, -3.9649e-02,  ..., -2.8054e-02,
           2.8421e-03,  3.4010e-02],
         [ 5.4524e-03,  2.4385e-02, -3.9649e-02,  ..., -2.8054e-02,
           2.8421e-03,  3.4010e-02]],

        ...,

        [[ 7.9684e+00,  7.9946e+00,  7.9834e+00,  ...,  7.9156e+00,
           7.9113e+00,  8.0028e+00],
         [ 6.1155e+00,  6.2138e+00,  6.0246e+00,  ...,  6.1267e+00,
           6.1100e+00,  6.1350e+00],
         [ 3.3304e+00,  3.5016e+00,  3.3533e+00,  ...,  3.3398e+00,
           3.4059e+00,  3.3948e+00],
         ...,
         [ 5.4524e-03,  2.4385e-02, -3.9649e-02,  ..., -2.8054e-02,
           2.8421e-03,  3.4010e-02],
         [ 5.4524e-03,  2.4385e-02, -3.9649e-02,  ..., -2.8054e-02,
           2.8421e-03,  3.4010e-02],
         [ 5.4524e-03,  2.4385e-02, -3.9649e-02,  ..., -2.8054e-02,
           2.8421e-03,  3.4010e-02]],

        [[ 8.1056e+00,  8.1097e+00,  8.1152e+00,  ...,  8.0462e+00,
           8.0314e+00,  8.1114e+00],
         [ 6.5150e+00,  6.6362e+00,  6.4300e+00,  ...,  6.5414e+00,
           6.4932e+00,  6.5473e+00],
         [ 3.7579e+00,  3.9144e+00,  3.7434e+00,  ...,  3.8008e+00,
           3.8159e+00,  3.8016e+00],
         ...,
         [ 5.4524e-03,  2.4385e-02, -3.9649e-02,  ..., -2.8054e-02,
           2.8421e-03,  3.4010e-02],
         [ 5.4524e-03,  2.4385e-02, -3.9649e-02,  ..., -2.8054e-02,
           2.8421e-03,  3.4010e-02],
         [ 5.4524e-03,  2.4385e-02, -3.9649e-02,  ..., -2.8054e-02,
           2.8421e-03,  3.4010e-02]],

        [[ 8.3452e+00,  8.3642e+00,  8.3595e+00,  ...,  8.2681e+00,
           8.2715e+00,  8.4071e+00],
         [ 6.8510e+00,  6.9509e+00,  6.7642e+00,  ...,  6.8221e+00,
           6.7819e+00,  6.9030e+00],
         [ 4.1307e+00,  4.2837e+00,  4.1108e+00,  ...,  4.1373e+00,
           4.1427e+00,  4.2269e+00],
         ...,
         [ 5.4524e-03,  2.4385e-02, -3.9649e-02,  ..., -2.8054e-02,
           2.8421e-03,  3.4010e-02],
         [ 5.4524e-03,  2.4385e-02, -3.9649e-02,  ..., -2.8054e-02,
           2.8421e-03,  3.4010e-02],
         [ 5.4524e-03,  2.4385e-02, -3.9649e-02,  ..., -2.8054e-02,
           2.8421e-03,  3.4010e-02]]])

speaker_class_predictions tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] .....

ptrblck · July 20, 2020, 3:54am

I’m glad to hear the first issue seems to be resolved.
For the zero output: could you try to overfit a small data sample (e.g. just 10 samples) and check whether your model still outputs class0 with the highest probability? This issue might be unrelated to the first one, and your model might just be overfitting to this class.

krioux5 · July 23, 2020, 3:20pm

Hello ptrblck,

I think I am doing exactly what you said here and would appreciate some help! This is the code at the end of my model:
…
pre_flattened_outp = self.output(outp17)
outp_test = torch.argmax(pre_flattened_outp.squeeze(),dim=0).detach().cpu().numpy()
my_tensor = torch.from_numpy(outp_test)
my_tensor = my_tensor.unsqueeze(0) # adding dimension for batch (s/b 1,1,572,572)
my_tensor = my_tensor.unsqueeze(0) # adding dimension for batch
my_tensor = my_tensor.type(torch.FloatTensor)
my_tensor=my_tensor.cuda() #cuda expected but got cpu
return my_tensor

I wanted to return from my model the segmentation as a tensor so I can then compare them with my target masks (I was getting incompatible size error when trying to compare as I was returning [1,12,388,388] and trying to perform BCEWithLogitsLoss with tensor of size [1,1,388,388] )

This is the error i’m getting:
File “C:\Users\User\AppData\Roaming\Python\Python37\site-packages\torch\autograd\__init__.py”, line 100, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

Thanks

krioux5 · July 23, 2020, 3:25pm

I tried moving these modifications to the training code and out of the model. It used to be in the forward() of my model and now it’s in the forward() of my trainer class, and the issue persists.

ptrblck · July 24, 2020, 2:26am

If you want to calculate the loss using this output tensor and a target and calculate the gradients with it, you must not detach the output, which is currently the case in your code:

outp_test = torch.argmax(pre_flattened_outp.squeeze(),dim=0).detach().cpu().numpy()
my_tensor = torch.from_numpy(outp_test)

Instead use the pre_flattened_outp directly to calculate the loss.

Once this is fixed, you might either want to change the loss functional or fix the target shape.

For a multi-class segmentation (one valid class per pixel) you should use nn.CrossEntropyLoss and the target should have the shape [batch_size, height, width] and contain class indices in the range [0, nb_classes-1].
While for a multi-label segmentation (zero, one or multiple valid classes per pixel), nn.BCEWithLogitsLoss can be used, but the target should have the same output shape as the model output: [batch_size, nb_classes, height, width].

krioux5 · July 24, 2020, 2:43am

Thank you for your response. Currently my U-net returns something of size [batch_size, num_channels, height, width], as the original U-net paper indicated in their network architecture.
For this reason (I think) I am getting the error:
ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
RuntimeError: 1only batches of spatial targets supported (3D tensors) but got targets of size: : [1, 1, 388, 388]

Should my model be returning the "[batch_size, height, width] and contain class indices in the range [0, nb_classes-1] " tensor you mentioned?

The only solution I can think of to make that happen involves squeezing to remove the dimension to be [num_chan,height,width]

If it helps, my code can be found here: GitHub - kylerioux/Unet_pytorch: This is a U-net I am writing with the goal of segmenting CT scans. I will adapt it to be a 3D Unet in the future..

I really appreciate the help.

ptrblck · July 24, 2020, 8:18am

The model output is right, if you are dealing with 12 classes.
However, the target has the unnecessary single channel in dim1, so you should remove it via:

target = target.squeeze(1)

Could you rerun the code after this change and check, if it’s working?

krioux5 · July 24, 2020, 11:30am

Training started! My misunderstanding was that I thought any type of squeezing and unsqueezing would interfere with backprop but it only happens when you do that to the Tensor (i assume). Hopefully my network can get some decent results!
Yesterday when I was debugging I kept finding your responses to many questions, so I just wanted to thank you for all the effort you put into these forums. Official documentation can be hard to understand for a beginner, but your responses are always extremely understandable.

krioux5 · July 24, 2020, 12:18pm

Hello,

It appears that loss is not being calculated correctly:

I don’t get any compilation errors, though.
My loss is calculated by the CrossEntropyLoss() function, which I am feeding the tensor I squeezed as well as the output from my network

    def forward(self,inp_images,tar_mask):
        """Run the network on one input and return ``(loss, pred_mask)``.

        The input gets a leading dimension via ``unsqueeze(0)`` before it is
        passed to ``self.net``. The target is cast to ``long`` and its dim 1 is
        squeezed away before ``self.criterion`` is applied.
        NOTE(review): the original comment expects the network input to be
        (1, 1, 572, 572) - confirm against the data loader.
        """
        # Add the leading (batch) dimension on the target device.
        net_input = inp_images.to(self.device).unsqueeze(0)
        pred_mask = self.net(net_input)

        # Integer class indices without the singleton dim 1, as the criterion
        # is fed the raw network output together with this target.
        class_target = tar_mask.to(self.device).long().squeeze(1)

        return self.criterion(pred_mask, class_target), pred_mask

criterion in my case is torch.nn.CrossEntropyLoss().

ptrblck · July 25, 2020, 2:09am

The loss calculation looks alright.
Are you “normalizing” it somehow afterwards before printing?

If that’s not the case, could you check the pixel-wise accuracy by comparing preds = torch.argmax(pred_mask, 1) to tar_mask?

krioux5 · July 25, 2020, 4:04am

I don’t think normalizing is the issue? I tried my code using a different dataset as well as a standard U-net, just I used the BCEWithLogitsLoss as it was a binary segmentation problem- and the loss was calculated and printed correctly. I also think it’s actually not being calculated correctly because my dice scores are not improving throughout the epochs.

I added the lines of code you mentioned before I calculate loss and got the following: (printed the outputs)

edit: Do you mean I should be checking those outputs after I’ve let the network train for awhile? The values I just sent you were the first training step so the predictions will be very bad. What is the goal of checking the pixel wise predictions?

Kyle

ptrblck · July 25, 2020, 4:18am

Based on the values and shapes of preds, it seems that your model predicts (mostly?) class4, while the target is (mostly?) class0.
This example shows, that nn.CrossEntropyLoss properly calculates the loss for this case:

# Random logits for 1 sample, 12 classes and a 338x338 map; class 4 is boosted
# so that it wins the argmax (almost) everywhere.
output = torch.randn(1, 12, 338, 338)
output[:, 4].add_(10)
# All-zero class-index target, i.e. every pixel belongs to class 0.
target = torch.zeros((1, 338, 338), dtype=torch.long)

criterion = nn.CrossEntropyLoss()
loss = criterion(output, target)
print(loss)

# Per-pixel predicted class, taken over the class dimension.
preds = output.argmax(dim=1)
print(preds)
print(target)

By normalizing I meant to ask whether you are dividing the loss by the number of samples or something like that, which might scale it down.

krioux5 · July 25, 2020, 4:46am

In the iterate() of my Trainer class I have

        for itr,batch in enumerate(dataloader):
            images,mask_target = batch
            loss, pred_mask = self.forward(images,mask_target)
            loss = loss/self.accumulation_steps

But this trainer class worked well in training on my other data set with the other U net architecture. Should I not be dividing the loss in this case, but I should in the other?

I also added the print for loss and got the following (final line represents printing loss)…

ptrblck · July 25, 2020, 4:52am

The last output suggests that the loss is apparently not zero?
I’m not familiar with your use case and the loss division might be right.
However, if a single batch creates a loss of ~2.54, while the epoch printing results in a zero loss, I would assume that the printing method has an error.

krioux5 · July 31, 2020, 2:13pm

So it turns out only the first few iterations have loss, and it later only shows 0 loss forever after that.
printed loss:

Unsurprisingly, the result is that the loss changes on the first epoch then gets stuck and remains the same from that point on.

This set-up worked with a binary semantic segmentation that used the BCEWithLogitsLoss() loss function; I kept everything else the same and just switched to cross-entropy due to the multiple labels, and am now facing this behavior.

Another weird note is that during my other segmentation, each epoch would take far longer (probably 20x longer), although both are utilizing the same batch size, same image input size, input to same network.

joel_ndoumbe · October 6, 2020, 5:08pm

Hello, I have a problem with my code, and I don’t know where it is!

ptrblck · October 8, 2020, 11:47pm

It seems you are rewrapping the output of the policy_net in a new tensor, which will break the computation graph.
Use the output directly to calculate the loss instead.

PS: you can post code snippets by wrapping them into three backticks ```, which would make debugging easier.

joel_ndoumbe · October 9, 2020, 7:46pm

Thanks! How can I know whether the parameters are being updated?

joel_ndoumbe · October 9, 2020, 8:48pm

class DQN(nn.Module):
    """Fully connected network: three 150-unit ReLU layers and a linear head.

    Args:
        p: Size of the input feature vector.
        k: Number of outputs produced by the final linear layer.
    """

    def __init__(self, p, k):
        super().__init__()
        width = 150

        self.lay1 = nn.Linear(p, width)
        self.ft1 = nn.ReLU()
        self.lay2 = nn.Linear(width, width)
        self.ft2 = nn.ReLU()
        self.lay3 = nn.Linear(width, width)
        self.ft3 = nn.ReLU()
        # The head stays linear: no activation after the last layer.
        self.lay4 = nn.Linear(width, k)

    def forward(self, x):
        """Return the head output reshaped to ``(x.size(0), -1)``."""
        hidden_stages = (
            (self.lay1, self.ft1),
            (self.lay2, self.ft2),
            (self.lay3, self.ft3),
        )
        for linear, activation in hidden_stages:
            x = activation(linear(x))
        head = self.lay4(x)
        return head.view(head.size(0), -1)

# Discount applied to the next-state value in optimize_model.
GAMMA = 0.999
# Exploration schedule used by select_action: the threshold decays
# exponentially from EPS_START towards EPS_END with time constant EPS_DECAY
# (measured in calls to select_action).
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 500
# NOTE(review): not referenced by any code visible here (test2 syncs the
# target network every 200 stored transitions instead) - confirm intent.
TARGET_UPDATE = 10


# Size of the action space and of the observation vector, both derived from
# the environment's dimensions.
n_actions = env.nb_paths*env.nb_block
n_observations = (env.nb_block+env.nb_slot)*env.nb_paths

# Two identically shaped networks; the target network starts as an exact copy
# of the policy network's weights.
policy_net = DQN(n_observations,n_actions).to(device)   
target_net = DQN(n_observations,n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())

# Number of select_action calls so far; drives the epsilon decay.
steps_done = 0
# NOTE(review): these two module-level lists are shadowed by locals of the
# same name in get_q_value and are not read elsewhere in the visible code.
Q_value=[]
Ver_Q_value=[]
# Action ids 0 .. n_actions-1; position i is masked out when mask[i] is truthy.
set_actions=list(np.arange(env.nb_paths*env.nb_block))
def get_q_value(observation, mask):
    """Return the policy network's outputs restricted to unmasked actions.

    Args:
        observation: Input passed unchanged to ``policy_net``.
        mask: Indexable with one entry per element of ``set_actions``; a
            truthy entry removes that position from the result.

    Returns:
        tuple: ``(values, position)`` where ``position`` lists the unmasked
        indices and ``values`` holds the matching entries of the network
        output converted with ``tolist()``.
    """
    q_list = policy_net(observation).tolist()
    feasible = [idx for idx in range(len(set_actions)) if not mask[idx]]
    feasible_q = [q_list[idx] for idx in feasible]
    return feasible_q, feasible
    
def select_action(observation,mask):
    """Pick an unmasked action with an epsilon-greedy rule.

    A uniform draw is compared against a threshold that decays exponentially
    from ``EPS_START`` to ``EPS_END`` as ``steps_done`` grows. Above the
    threshold, the unmasked position with the largest network output is
    returned; otherwise a random unmasked action is returned. ``steps_done``
    is incremented on every call.

    Args:
        observation: Input forwarded to ``get_q_value``.
        mask: Indexable with one entry per element of ``set_actions``; truthy
            entries mark actions that may not be chosen.

    Returns:
        The chosen action (an index from ``get_q_value`` on the greedy path,
        an element of ``set_actions`` on the random path).
    """
    global steps_done
    feasible_actions = [act for idx, act in enumerate(set_actions) if not mask[idx]]
    draw = random.random()
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    # Explore: uniform choice among the feasible actions.
    if not draw > eps_threshold:
        return random.sample(feasible_actions, 1)[0]

    # Exploit: best-scoring feasible position, no gradient tracking needed.
    with torch.no_grad():
        feasible_q, position = get_q_value(observation, mask)
        best = torch.argmax(torch.Tensor(feasible_q))
        return position[best]

# Step size shared by both Adam optimisers below.
learning_rate =0.01
# NOTE(review): replay2 and criterion are not referenced by the visible code
# (optimize_model uses F.smooth_l1_loss directly) - confirm before removing.
replay2 = collections.deque()
criterion = nn.MSELoss()
# Optimiser stepped by optimize_model; it updates policy_net only.
optimizer = optim.Adam(policy_net.parameters(),lr=learning_rate)
# NOTE(review): never stepped in the visible code; target_net is refreshed via
# load_state_dict in test2 instead.
optimizer_target = optim.Adam(target_net.parameters(),lr=learning_rate)
# Snapshot list of policy_net's parameter tensors (same objects, not copies).
parametre = [param for param in policy_net.parameters()]

class R(object):
    """Plain container holding four parallel lists of transition data.

    ``len()`` of an instance is the number of stored actions.
    """

    def __init__(self):
        # One list per transition field; callers append to them directly.
        self.states, self.next_stat, self.rewarde, self.actions = [], [], [], []

    def __len__(self):
        stored_actions = self.actions
        return len(stored_actions)
# NOTE(review): module-level placeholders; the visible code does not read
# them (test2 uses its own local ``state``) - confirm before removing.
state =[]
rewr = []
action = []
def optimize_model(replay):
    """Run one optimisation step of ``policy_net`` on the stored transitions.

    Args:
        replay: Object exposing the parallel lists ``states``, ``next_stat``,
            ``rewarde`` and ``actions`` (one tensor per transition; a
            ``None`` entry in ``next_stat`` marks a final state) and
            supporting ``len()``.

    Returns:
        The smooth-L1 loss tensor, or ``None`` when fewer than 100
        transitions are stored and no update is performed.
    """
    if len(replay) < 100:
        return
    # Compute a mask of non-final states
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor([s is not None for s in replay.next_stat],
                                  device=device, dtype=torch.bool)

    # The stored tensors are moved to the networks' device before use.
    state_batch = torch.cat(replay.states).to(device)
    action_batch = torch.cat(replay.actions).to(device)
    reward_batch = torch.cat(replay.rewarde).to(device)

    # Q(s_t, a_t): pick, for EACH transition, the Q-value of the action taken
    # in that transition's own state. Indexing ``Q[0][actions]`` would read
    # every value from the first state's row only.
    action_index = action_batch.long().unsqueeze(1)
    state_action_values2 = policy_net(state_batch).gather(1, action_index).squeeze(1)

    # max_a Q_target(s_{t+1}, a); stays 0 for final states.
    next_state_values = torch.zeros(len(action_batch), device=device)
    if non_final_mask.any():  # torch.cat would fail on an empty list
        non_final_next_states = torch.cat(
            [s for s in replay.next_stat if s is not None]).to(device)
        next_state_values[non_final_mask] = (
            target_net(non_final_next_states).max(1)[0].detach())
    # Rewards are scaled down by 100 before being added to the target.
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch/100

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values2, expected_state_action_values)
    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        # Parameters that did not take part in the graph have no gradient.
        if param.grad is not None:
            param.grad.data.clamp_(-1, 1)
    optimizer.step()
    return loss

# Module-level accumulators. test2 appends to reward_epoch, result_final and
# losses_policy.
# NOTE(review): ``loss`` and ``losses_target`` are not read by the visible
# code - confirm before removing.
loss = 0
reward_epoch = []
result_final = []
losses_target = []
losses_policy = []

def test2(start):
    """Run the interaction/training loop over 10 datasets for one epoch.

    For every dataset index an episode is played against ``env`` until it
    reports ``terminal``; every transition is stored in a fresh ``R`` buffer
    and ``optimize_model`` is called once, at the end of the episode.

    Args:
        start: Reference timestamp (as from ``time.time()``) used only to
            print the elapsed time.

    Returns:
        tuple: ``(result_final, val_reward, losses_policy)`` - the module-level
        list of ``env.result_final`` values, the mean episode reward, and the
        module-level list of values returned by ``optimize_model``.
    """
    # Set up lists to hold results
    total_rewards_eps = []
    # initialise data and system
    nb_dataset=10
    nb_epoch = 1

    policy_net.train()
    for epoch in range(nb_epoch):
        env.reset()
        reward_batch = []

        for batch in range(nb_dataset):

            rewards = []
            # A new buffer per episode: optimize_model therefore only updates
            # the network when this single episode holds >= 100 transitions.
            replay = R()
            # get the initial state
            current_state ,mask= env.get_state(batch) # the mask is important to know the feasible actions

            state = current_state

            # Incremented per step but not otherwise used in this function.
            count_iteration = 0
            # main infinite loop (left via ``break`` on a terminal step)
            while True:
                available_action=[set_actions[i] for i in range(len(set_actions)) if not mask[i]]
                if len(available_action)==0:
                    # No feasible action: the environment is told so via this
                    # sentinel string.
                    action_index="impossible"
                else:
                    action_index = select_action(torch.Tensor(state),mask)
                # get next state and reward
                next_state, mask_1, reward, terminal = env.act(action_index,batch)
                count_iteration+=1
                rewards.append(reward)
                if action_index == "impossible":
                    # The buffer needs a numeric action, so a random one is
                    # stored in place of the sentinel.
                    action_index = random.randint(0, n_actions-1)

                # NOTE(review): next_state is always stored as a tensor, never
                # as None, so optimize_model treats every transition
                # (including the terminal one) as non-final - confirm intent.
                replay.states.append(torch.Tensor([state]))
                replay.next_stat.append(torch.Tensor([next_state]))
                replay.rewarde.append(torch.Tensor([reward]))
                replay.actions.append(torch.Tensor([action_index]))

                if not terminal:
                    state = next_state
                    mask = mask_1

                else:
                    total_rewards_eps.append(sum(rewards))
                    result_final.append(env.result_final)
                    # May be None when the episode was shorter than 100 steps.
                    loss = optimize_model(replay)
                    losses_policy.append(loss)
                    print(loss)
                    print(sum(rewards))

                    break
                # Sync the target network every 200 stored transitions of the
                # current episode (only reached on non-terminal steps).
                if len(replay)%200 == 0:
                    target_net.load_state_dict(policy_net.state_dict())

        reward_batch.append(total_rewards_eps)

    reward_epoch.append(reward_batch)  

    val_reward= np.mean(total_rewards_eps)
    print(val_reward)
    print ("elapsed time:", time.time() - start)

    return result_final,val_reward,losses_policy

ptrblck · October 10, 2020, 3:47am

After fixing the initial error message you could check, if all parameters get gradients in their .grad attribute after the backward() call or you could create a deepcopy of the state_dict before the optimizer.step() and compare it to the state_dict afterwards.