RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

I am trying to recreate a CNN model from Keras in PyTorch, and I am hitting a runtime error on my loss.backward() call.
I have checked the model summary and it ran fine, with the expected channels and tensor shapes, but due to my lack of knowledge I cannot locate the bug. Can anyone give me some advice?
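From what I understand, this error usually means the tensor passed to backward() is not connected to anything that requires gradients. A generic minimal reproduction of the message (not my actual code, just to illustrate):

import torch
loss = torch.tensor(0.5)   # plain tensor: requires_grad=False, no grad_fn
loss.backward()            # RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn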

The model:

import torch
import torch.nn as nn
from typing import List
class DNA_CNN_test2(nn.Module): # deepcre model  
    def __init__(self,
                 seq_len: int =1000,
                 #num_filters: List[int] = [64, 128, 64],
                 kernel_size: int = 8,
                 p = 0.25): # drop out value 
        super().__init__()
        self.seq_len = seq_len
       
        window_size = int(seq_len*(8/3000))
        # CNN module
        self.conv_net = nn.Sequential() # Sequential container: its forward() method accepts any input and forwards it to the first module it contains
        #num_filters = [4] + num_filters
     
        self.model = nn.Sequential(
            # conv block 1
            nn.Conv1d(4,64,kernel_size=kernel_size, padding='same'),
            nn.ReLU(inplace=True), 
            nn.Conv1d(64,64,kernel_size=kernel_size, padding='same'), 
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=window_size),
            nn.Dropout(p),
            # conv block 2
            nn.Conv1d(64,128,kernel_size=kernel_size, padding='same'),
            nn.ReLU(inplace=True),
            nn.Conv1d(128,128,kernel_size=kernel_size, padding='same'),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=window_size),
            nn.Dropout(p),
            # conv block 3
            nn.Conv1d(128,64,kernel_size=kernel_size, padding='same'),
            nn.ReLU(inplace=True),
            nn.Conv1d(64,64,kernel_size=kernel_size, padding='same'),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=window_size),
            nn.Dropout(p),
            nn.Flatten(),
        
            nn.Linear(64*(seq_len//window_size**3), 1))
            #nn.ReLU(inplace=True),
            #nn.Dropout(p),
            #nn.Linear(128, 64),
            #nn.ReLU(inplace=True),
            #nn.Linear(64*seq_len, 1))

    def forward(self, xb: torch.Tensor):
        """Forward pass."""
        # reshape view to batch_size x 4 channels x seq_len;
        # permute to put the channel dim in the correct order,
        # i.e. (batch_size, 4 channels - OHE(DNA), seq_len)
            
        xb = xb.permute(0, 2, 1).mean(dim=[1, 2], keepdim=True).squeeze(dim=-1)
        out = self.conv_net(xb)
        return out 
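For reference, this is roughly how I sanity-check the forward pass (a minimal sketch with dummy data; the (batch, seq_len, 4) one-hot layout is an assumption, not my real DataLoader):

# minimal forward-pass sanity check (dummy data, not the real DataLoader)
model = DNA_CNN_test2(seq_len=1000)
xb = torch.randn(8, 1000, 4)            # assumed layout: (batch, seq_len, 4 one-hot channels)
out = model(xb.float())
print(out.shape)                        # shape of the forward output
print(out.requires_grad, out.grad_fn)   # should be True / not None if the graph is intact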

The loss_batch, train/validation step, and fitting functions:

# +--------------------------------+
# | Training and fitting functions |
# +--------------------------------+
import numpy as np
import torch

def loss_batch(model, loss_func, xb, yb, opt=None,verbose=False):
    '''
    Apply loss function to a batch of inputs. If no optimizer
    is provided, skip the back prop step.
    '''
    if verbose:
        print('loss batch ****')
        print("xb shape:",xb.shape)
        print("yb shape:",yb.shape)
        print("yb shape:",yb.squeeze(1).shape)
        #print("yb",yb)

    # get the batch output from the model given your input batch 
    # ** This is the model's prediction for the y labels! **
    xb_out = model(xb.float())
    
    if verbose:
        print("model out pre loss", xb_out.shape)
        #print('xb_out', xb_out)
        print("xb_out:",xb_out.shape)
        print("yb:",yb.shape)
        print("yb.long:",yb.long().shape)
    
    loss = loss_func(xb_out, yb.float()) # for MSE/regression
    # __FOOTNOTE 2__
    
    if opt is not None: # if opt
        loss.backward()
        opt.step()
        opt.zero_grad()

    return loss.item(), len(xb)


def train_step(model, train_dl, loss_func, device, opt):
    '''
    Execute 1 set of batched training within an epoch
    '''
    # Set model to Training mode
    model.train()
    tl = [] # train losses
    ns = [] # batch sizes, n
    
    # loop through train DataLoader
    for xb, yb in train_dl:
        # put on GPU
        xb, yb = xb.to(device),yb.to(device)
        
        # provide opt so backprop happens
        t, n = loss_batch(model, loss_func, xb, yb, opt=opt)
        
        # collect train loss and batch sizes
        tl.append(t)
        ns.append(n)
    
    # average the losses over all batches    
    train_loss = np.sum(np.multiply(tl, ns)) / np.sum(ns)
    
    return train_loss

def val_step(model, val_dl, loss_func, device):
    '''
    Execute 1 set of batched validation within an epoch
    '''
    # Set model to Evaluation mode
    model.eval()
    with torch.no_grad():
        vl = [] # val losses
        ns = [] # batch sizes, n
        
        # loop through validation DataLoader
        for xb, yb in val_dl:
            # put on GPU
            xb, yb = xb.to(device),yb.to(device)

            # Do NOT provide opt here, so backprop does not happen
            v, n = loss_batch(model, loss_func, xb, yb)

            # collect val loss and batch sizes
            vl.append(v)
            ns.append(n)

    # average the losses over all batches
    val_loss = np.sum(np.multiply(vl, ns)) / np.sum(ns)
    
    return val_loss


def fit(epochs, model, loss_func, opt, train_dl, val_dl,device,patience=1000):
    '''
    Fit the model params to the training data, eval on unseen data.
    Loop for a number of epochs and keep track of train and val losses 
    along the way
    '''
    # keep track of losses
    train_losses = []    
    val_losses = []
    
    # loop through epochs
    for epoch in range(epochs):
        # take a training step
        train_loss = train_step(model,train_dl,loss_func,device,opt)
        train_losses.append(train_loss)

        # take a validation step
        val_loss = val_step(model,val_dl,loss_func,device)
        val_losses.append(val_loss)
        
        print(f"E{epoch} | train loss: {train_loss:.3f} | val loss: {val_loss:.3f}")

    return train_losses, val_losses


def run_model(train_dl,val_dl,model,device,
              lr=1e-2, epochs=50, 
              lossf=None,opt=None
             ):
    '''
    Given train and val DataLoaders and a NN model, fit the model to the training
    data. By default, use MSE loss and an SGD optimizer.
    '''
    # define optimizer
    if opt:
        optimizer = opt
    else: # if no opt provided, just use SGD
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    
    # define loss function
    if lossf:
        loss_func = lossf
    else: # if no loss function provided, just use MSE
        loss_func = torch.nn.MSELoss()
    
    # run the training loop
    train_losses, val_losses = fit(
                                epochs, 
                                model, 
                                loss_func, 
                                optimizer, 
                                train_dl, 
                                val_dl, 
                                device)

    return train_losses, val_losses
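A quick way to narrow this down would be to check, outside of the training loop, whether the model output and the loss still carry a grad_fn (a sketch only; model is the instantiated network, train_dl and device come from my setup):

# diagnostic sketch: confirm the model output and the loss are attached to the autograd graph
xb, yb = next(iter(train_dl))            # one batch from the training DataLoader
xb, yb = xb.to(device), yb.to(device)

out = model(xb.float())
print(out.requires_grad, out.grad_fn)    # False / None here means the graph is already broken

loss = torch.nn.MSELoss()(out, yb.float())
print(loss.requires_grad, loss.grad_fn)  # must be True / not None for loss.backward() to work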

Error:

RuntimeError                              Traceback (most recent call last)
Cell In[51], line 5
      2 DNA_CNN_test2 = DNA_CNN_test2(seq_len)
      3 DNA_CNN_test2.to(device)
----> 5 DNA_CNN_test2_train_losses_lr4, DNA_CNN_test2_val_losses_lr4 = run_model(
      6     train_dl, 
      7     val_dl, 
      8     DNA_CNN_test2,
      9     device,
     10     epochs=100,
     11     lr= 1e-2
     12 )

Cell In[42], line 139, in run_model(train_dl, val_dl, model, device, lr, epochs, lossf, opt)
    136     loss_func = torch.nn.MSELoss()
    138 # run the training loop
--> 139 train_losses, val_losses = fit(
    140                             epochs, 
    141                             model, 
    142                             loss_func, 
    143                             optimizer, 
    144                             train_dl, 
    145                             val_dl, 
    146                             device)
    148 return train_losses, val_losses

Cell In[42], line 106, in fit(epochs, model, loss_func, opt, train_dl, val_dl, device, patience)
    103 # loop through epochs
    104 for epoch in range(epochs):
    105     # take a training step
--> 106     train_loss = train_step(model,train_dl,loss_func,device,opt)
    107     train_losses.append(train_loss)
    109     # take a validation step

Cell In[42], line 54, in train_step(model, train_dl, loss_func, device, opt)
     51 xb, yb = xb.to(device),yb.to(device)
     53 # provide opt so backprop happens
---> 54 t, n = loss_batch(model, loss_func, xb, yb, opt=opt)
     56 # collect train loss and batch sizes
     57 tl.append(t)

Cell In[42], line 32, in loss_batch(model, loss_func, xb, yb, opt, verbose)
     29 # __FOOTNOTE 2__
     31 if opt is not None: # if opt
---> 32     loss.backward()
     33     opt.step()
     34     opt.zero_grad()

File /mnt/biostat/environments/parkj/dna2rna/lib/python3.11/site-packages/torch/_tensor.py:487, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
    477 if has_torch_function_unary(self):
    478     return handle_torch_function(
    479         Tensor.backward,
    480         (self,),
   (...)
    485         inputs=inputs,
    486     )
--> 487 torch.autograd.backward(
    488     self, gradient, retain_graph, create_graph, inputs=inputs
    489 )

File /mnt/biostat/environments/parkj/dna2rna/lib/python3.11/site-packages/torch/autograd/__init__.py:200, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    195     retain_graph = create_graph
    197 # The reason we repeat same the comment below is that
    198 # some Python versions print out the first line of a multi-line function
    199 # calls in the traceback and some print out the last line
--> 200 Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    201     tensors, grad_tensors_, retain_graph, create_graph, inputs,
    202     allow_unreachable=True, accumulate_grad=True)

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
