Getting gradients w.r.t. an input that is a concatenation of several tensors

Hi everyone.
My code is below. The input size of the model is 8, which is the concatenation of two tensors of sizes 5 and 3. I want to do just one forward pass on the model and one backward pass, and get two gradients: one w.r.t. the first part of the concatenation (size 5) and the other w.r.t. the second part (size 3).
The batch size is 5.

I am new to PyTorch and would appreciate any help.

import torch
import torch.nn as nn

class Net(nn.Module):
    """
    A simple multilayer perceptron with one hidden layer.
    """
    def __init__(self, num_input, num_hidden, num_output, dropout,
                 activation='tanh'):
        super(Net, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(num_input, num_hidden)
        self.fc2 = nn.Linear(num_hidden, num_output)

        if activation == 'tanh':
            self.activation_f = torch.tanh
        elif activation == 'relu':
            self.activation_f = torch.relu

    def forward(self, x):
        x = self.activation_f(self.fc1(x))
        x = self.dropout(x)
        x = torch.sigmoid(self.fc2(x))
        return x

model = Net(num_input=8, num_hidden=4, num_output=2, dropout=0.0, activation='tanh')

for batch_idx, (data, label) in enumerate(data_loader): 

    data = data.clone().detach().requires_grad_(True)
    print('data:', data)
    x1 = data[:, :5]
    x1 = x1.clone().detach().requires_grad_(True)
    x2 = data[:, 5:8]
    print('x1:', x1)
    print('x2:', x2)

    output = model(data)
    print('output:', output)
    loss = loss_fn(pred=output, target=label)
    print('loss:', loss)
    grad_tensor_1 = torch.autograd.grad(outputs=loss, inputs=x1, allow_unused=True)
    print('gradient of x1: ', grad_tensor_1)
    grad_tensor_2 = torch.autograd.grad(outputs=loss, inputs=x2, allow_unused=True)
    print('gradient of x2: ', grad_tensor_2)

    loss.backward()
    optimizer.step()

I got the following output:

label tensor([1, 0, 1, 0, 0])
data: tensor([[0.4777, 0.5840, 0.5885, 0.5624, 0.5781, 0.5119, 0.5466, 0.3983],
        [0.6305, 0.6040, 0.4076, 0.5833, 0.4156, 0.5065, 0.4737, 0.4937],
        [0.5168, 0.5520, 0.6126, 0.5375, 0.5495, 0.4984, 0.4365, 0.4092],
        [0.4785, 0.5723, 0.5018, 0.5540, 0.4677, 0.5447, 0.3886, 0.5680],
        [0.5685, 0.6240, 0.3949, 0.5555, 0.4099, 0.5065, 0.4737, 0.4937]],
       requires_grad=True)
x1: tensor([[0.4777, 0.5840, 0.5885, 0.5624, 0.5781],
        [0.6305, 0.6040, 0.4076, 0.5833, 0.4156],
        [0.5168, 0.5520, 0.6126, 0.5375, 0.5495],
        [0.4785, 0.5723, 0.5018, 0.5540, 0.4677],
        [0.5685, 0.6240, 0.3949, 0.5555, 0.4099]], requires_grad=True)
x2: tensor([[0.5119, 0.5466, 0.3983],
        [0.5065, 0.4737, 0.4937],
        [0.4984, 0.4365, 0.4092],
        [0.5447, 0.3886, 0.5680],
        [0.5065, 0.4737, 0.4937]], grad_fn=<SliceBackward>)
output: tensor([[0.5724, 0.3881],
        [0.5688, 0.3966],
        [0.5680, 0.3904],
        [0.5701, 0.3887],
        [0.5696, 0.3944]], grad_fn=<SigmoidBackward>)
loss: tensor(0.6804, grad_fn=<NllLossBackward>)
gradient of x1:  (None,)
gradient of x2:  (None,)

How can I get the gradient w.r.t x1 and x2?
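What I was thinking of trying is to build the concatenated input from x1 and x2 directly (instead of slicing data after the fact), so that both are leaf tensors of the graph. A rough sketch of what I mean (I am not sure this is the right approach):

x1 = data[:, :5].clone().detach().requires_grad_(True)   # first part of the input, size 5
x2 = data[:, 5:8].clone().detach().requires_grad_(True)  # second part of the input, size 3
inp = torch.cat([x1, x2], dim=1)                          # concatenated input, size 8

output = model(inp)
loss = loss_fn(pred=output, target=label)

# one forward and one backward, gradients w.r.t. both parts at once
grad_x1, grad_x2 = torch.autograd.grad(outputs=loss, inputs=[x1, x2])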

I did get the gradient of the loss w.r.t. the whole input data as follows:

grad = torch.autograd.grad(outputs=loss, inputs=data, allow_unused=True)

output:

 (tensor([[ 0.0073, -0.0088, -0.0019, -0.0110, -0.0023,  0.0060,  0.0012, -0.0022],
        [-0.0086,  0.0103,  0.0022,  0.0130,  0.0027, -0.0069, -0.0013,  0.0026],
        [-0.0086,  0.0103,  0.0021,  0.0131,  0.0026, -0.0070, -0.0013,  0.0026],
        [ 0.0072, -0.0086, -0.0019, -0.0109, -0.0023,  0.0058,  0.0012, -0.0021],
        [-0.0086,  0.0103,  0.0021,  0.0131,  0.0027, -0.0070, -0.0013,  0.0026]]),)

which is a tuple containing a single tensor, and I could split that tensor into two parts to get the two gradients: one w.r.t. x1 and the other w.r.t. x2.
Is that the right idea?
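Concretely, the splitting I have in mind looks like this (assuming the first 5 columns correspond to x1 and the last 3 to x2):

full_grad = torch.autograd.grad(outputs=loss, inputs=data)[0]  # shape (batch_size, 8)
grad_x1 = full_grad[:, :5]   # gradient w.r.t. the first 5 features (x1)
grad_x2 = full_grad[:, 5:8]  # gradient w.r.t. the last 3 features (x2)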
I also want to update the model. Does the following code work for that?

for batch_idx, (data, label) in enumerate(data_loader_new): 
    optimizer.zero_grad()
    data = data.clone().detach().requires_grad_(True)
    output = model(data)
    loss = loss_fn(pred=output, target=label)
    grad = torch.autograd.grad(outputs=loss, inputs=data, allow_unused=True)
    optimizer.step() 

And is there any difference between that code and the following one when it comes to updating the model?

for batch_idx, (data, label) in enumerate(data_loader_new): 
    optimizer.zero_grad()
    data = data.clone().detach().requires_grad_(True)
    output = model(data)
    loss = loss_fn(pred=output, target=label)
    loss.backward()
    optimizer.step()
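In case it matters, what I ultimately want is both at once: the gradient w.r.t. the input and the parameter update. My guess, which may well be wrong, is that I would then need retain_graph=True so the graph survives for the second backward pass, roughly like this:

for batch_idx, (data, label) in enumerate(data_loader_new):
    optimizer.zero_grad()
    data = data.clone().detach().requires_grad_(True)
    output = model(data)
    loss = loss_fn(pred=output, target=label)

    # gradient w.r.t. the input; retain_graph=True keeps the graph for the second backward
    grad_input, = torch.autograd.grad(outputs=loss, inputs=data, retain_graph=True)

    loss.backward()    # fills .grad of the model parameters
    optimizer.step()   # updates the parameters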