X_input.grad.data is NoneType

I am trying to add an nn.Embedding layer to my transformer model, but I now get an AttributeError because x_input.grad is None at the line x_grad = torch.sign(x_input.grad.data) in my create_augmented_data function.
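
For context, the gradient-sign step relies on the usual FGSM pattern, where a leaf float input created with requires_grad=True gets its .grad populated after backward(). A toy sketch of what I expect to happen (made-up sizes, plain linear model instead of my transformer):

import torch
import torch.nn as nn

x = torch.randn(8, 10, requires_grad=True)   # leaf float input
y = torch.randint(0, 3, (8,))
model = nn.Linear(10, 3)

loss = nn.CrossEntropyLoss()(model(x), y)
loss.backward()

x_grad = torch.sign(x.grad)                  # x.grad is populated in this setup
x_adv = x + 0.1 * x_grad
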
Here is my code for my model:

import torch
import torch.nn as nn


class MLP(torch.nn.Module):
    def __init__(self, num_fc_layers, num_fc_units, dropout_rate, dim_num_heads):
        super().__init__()
        # round the embedding size down to a multiple of the number of heads
        if num_fc_units % dim_num_heads != 0:
            num_fc_units = num_fc_units // dim_num_heads * dim_num_heads
        embed_dim = num_fc_units
        self.embedding = nn.Embedding(998, embed_dim).requires_grad_(True)
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=dim_num_heads,  # number of heads in the multi-head attention
            dim_feedforward=embed_dim,
            dropout=0.1,
            activation='relu'
        )
        self.transformer_encoder = nn.TransformerEncoder(self.transformer_encoder_layer, num_layers=num_fc_layers)
        self.layers = nn.ModuleList()
        self.layers.append(nn.Linear(embed_dim, embed_dim))
        self.layers.append(nn.ReLU(True))
        self.layers.append(nn.Dropout(p=dropout_rate))
        for i in range(num_fc_layers):
            self.layers.append(nn.Linear(embed_dim, embed_dim))
            self.layers.append(nn.ReLU(True))
            self.layers.append(nn.Dropout(p=dropout_rate))
        self.output_layer = nn.Linear(embed_dim, 24)

    def forward(self, x):
        x = x.long()                      # cast float inputs to integer token indices
        x = self.embedding(x)             # [batch, seq_len] -> [batch, seq_len, embed_dim]
        x = torch.transpose(x, 0, 1)      # [seq_len, batch, embed_dim] for the encoder
        x = self.transformer_encoder(x)
        x = torch.transpose(x, 0, 1)      # back to [batch, seq_len, embed_dim]
        x = torch.mean(x, dim=1)          # average over the sequence dimension
        for i in range(len(self.layers)):
            x = self.layers[i](x)
        x = self.output_layer(x)
        return x

Code for the create_augmented_data function:

from torch.autograd import Variable


def create_augmented_data(x_train, y_train, eps, model):
    x_input = torch.from_numpy(x_train).float()
    y_true = torch.from_numpy(y_train).long()
    x_input = x_input.to(device)
    y_true = y_true.to(device)
    x_input = Variable(x_input, requires_grad=True)  # leaf input whose gradient I want
    y_true = Variable(y_true)
    train_outputs = model(x_input)
    # sanity check: inspect parameter gradients (this runs before backward())
    for name, param in model.named_parameters():
        if param.grad is None:
            print(name, 'is None')
        else:
            print('param {}: {}'.format(name, param.grad.abs().sum()))
    ad_loss = torch.nn.CrossEntropyLoss()
    loss_cal = ad_loss(train_outputs, y_true)
    loss_cal.backward(retain_graph=True)
    x_grad = torch.sign(x_input.grad.data)  # <-- fails here: x_input.grad is None
    x_adversarial = x_input + eps * x_grad
    x_aug = torch.cat([x_input, x_adversarial], dim=0)
    y_aug = torch.cat([y_true, y_true], dim=0)
    return x_aug, y_aug

Code for training:

model = MLP(num_fc_layers, num_fc_units, dropout_rate, num_heads).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
for epoch in range(2):
    # print("EPOCHS: {0}".format(epoch))
    train_losses = []
    correct = 0
    loss = 0
    total_predictions = 0
    model.train()
    for i, (x, y) in enumerate(train_loader):
        x_input, y_input = x.numpy(), y.numpy()
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        model.eval()
        x_aug, y_aug = create_augmented_data(x_input, y_input, eps=1.0, model=model)
        x_aug, y_aug = x_aug.to(device), y_aug.to(device)
        model.train()
        output = model(x_aug)
        loss = criterion(output, y_aug)
        train_losses.append(loss)
        loss.backward()
        optimizer.step()
        pred = output.max(1, keepdim=True)[1]
        correct += pred.eq(y_aug.view_as(pred)).sum().item()
        total_predictions += y_aug.size(0)

Attempts I have made to solve the problem:

  1. Printed at each step to narrow down where the problem comes from; everything looks fine up to the line x_grad = torch.sign(x_input.grad.data), but print(x_input.grad) right before that line outputs None:
Variable: x_input.shape: torch.Size([32, 150])
Variable: y_true.shape: torch.Size([32])
train_outputs.shape: torch.Size([32, 24])
ad_loss: CrossEntropyLoss()
loss_cal: tensor(3.1756, grad_fn=<NllLossBackward0>)
x_input.grad None
  2. Printed requires_grad for every model parameter after train_outputs = model(x_input); all parameters have requires_grad == True:
for name, param in model.named_parameters():
    print(name, param.requires_grad)
  3. Checked whether x_input is still a leaf tensor and whether any other operation was inserted in between:
# add these two lines after loss_cal.backward(retain_graph=True)
print(x_input.grad_fn)  # returns None
print(x_input.is_leaf)  # returns True
  4. Checked .grad for all model parameters; every param.grad is None:
for name, param in model.named_parameters():
    if param.grad is None:
        print(name, 'is None')
    else:
        print('param {}: {}'.format(name, param.grad.abs().sum()))
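
To make this easier to reproduce without my full training pipeline, here is a stripped-down sketch (made-up sizes, same pattern as my model: a float leaf input cast to long indices before an nn.Embedding lookup) that I believe shows the same None gradient:

import torch
import torch.nn as nn

emb = nn.Embedding(998, 16)
head = nn.Linear(16, 24)

x_input = (torch.rand(4, 150) * 997).requires_grad_(True)  # float "token" values, leaf tensor
y_true = torch.randint(0, 24, (4,))

h = emb(x_input.long())                  # cast to integer indices, as in my forward()
out = head(h.mean(dim=1))
loss = nn.CrossEntropyLoss()(out, y_true)
loss.backward()

print(x_input.grad)                      # prints None, just like in my function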

I am new to transformers and am lost in this situation. Could you please help me understand what might be causing the AttributeError: 'NoneType' object has no attribute 'data' at x_grad = torch.sign(x_input.grad.data)?

Thank you so much for your help!

Double post from here with a follow-up.