Derivative of model's output with respect to its input

I am trying to calculate higher-order gradients of a neural network's output with respect to its input.
I trained the network to approximate the function y = x^2. The first derivative, computed below, comes out as expected, roughly 2*x. But when I compute the second derivative I get all 0's, where I expect a constant 2. Can anyone explain what I am doing wrong?
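
For reference, the double-differentiation pattern itself gives the expected result when applied to the analytic function directly (a minimal standalone sketch, independent of my model):

import torch

x = torch.linspace(-3, 3, 5, requires_grad=True)
y = x ** 2

# 1st derivative: create_graph=True keeps the graph so the result can
# itself be differentiated
dydx = torch.autograd.grad(y, x, grad_outputs=torch.ones_like(y),
                           create_graph=True)[0]

# 2nd derivative
d2ydx2 = torch.autograd.grad(dydx, x, grad_outputs=torch.ones_like(dydx),
                             create_graph=True)[0]

print(dydx)    # tensor([-6., -3.,  0.,  3.,  6.], ...)
print(d2ydx2)  # tensor([2., 2., 2., 2., 2.], ...)

So the pattern works on the raw function; the zeros only show up when I differentiate through the trained model. My full code is below.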

#%%
import pandas as pd
import numpy as np
#%%
# Create dataset
N = 10000
data = pd.DataFrame(np.zeros((N, 2)), columns= ["x", "y"])

data["x"] = np.random.uniform(-50, 50, N)
data["y"] = data["x"]**2

#%%
import torch
import torch.nn as nn
#%%
X_train = data["x"]
y_train = data["y"]
#%%

X_train = X_train.values.reshape(-1,1)
y_train = y_train.values.reshape(-1,1)
#%%
## Convert to tensors
n_train = X_train.shape[0]
X_train = torch.tensor(X_train, dtype=torch.float).view(-1,1)
y_train = torch.tensor(y_train, dtype=torch.float).view(-1, 1)
X_train.shape
y_train.shape
#%%
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#%%
# Hyper-parameters 
input_size = 1
hidden_size = 64
num_epochs = 100
batch_size = 250

learning_rate = 1e-3    # 0.001

#%%
# Data loader
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
#%%
train_loader = torch.utils.data.DataLoader(dataset = train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=False)


#%%
# Fully connected neural network with two hidden layers

class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) 
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.last = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.last(out)
        out = self.relu(out)
        return out
        
    def u(self, features):
        # Convenience wrapper around forward(); the output is a non-leaf
        # tensor that already requires grad, so requires_grad_ is redundant
        u = self.forward(features)
        u.requires_grad_(True)
        return u

#%%
model = NeuralNet(input_size, hidden_size).to(device)
model
#%%
# Loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  

#%%
training_loss = []
# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (features, price) in enumerate(train_loader):
        # features and price come in batches
        # Move tensors to the configured device
        features = features.to(device)
        features.requires_grad_(True)
        price = price.to(device)
        
        # Forward pass
        output = model(features.float())
        loss = criterion(output, price)
                        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        training_loss.append(loss.item())
        
        if (i+1) % 10 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.8f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
            


#%%
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
#%%

# make predictions with the model
X_train.requires_grad_(True)
# yhat = model(X_train)
yhat = model.u(X_train)
#%%
print("Train")

print(criterion(y_train, yhat))

######################## Calculation of Derivative #################
#%%
deriv = torch.autograd.grad(outputs=[yhat], inputs=[X_train],
                            grad_outputs=torch.ones_like(yhat),
                            allow_unused=True, retain_graph=True,
                            create_graph=True)[0]
deriv
#%%
# 2nd derivative
deriv2 = torch.autograd.grad(outputs=[deriv], inputs=[X_train],
                             grad_outputs=torch.ones_like(deriv),
                             allow_unused=True, retain_graph=True,
                             create_graph=True)[0]
deriv2
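#%%
# Cross-check (sketch, assumes PyTorch >= 1.5 for torch.autograd.functional):
# the Hessian of the scalar model output at a single input point is the
# second derivative d2y/dx2 there; it also prints 0 here, matching deriv2
from torch.autograd.functional import hessian

x0 = torch.tensor([1.5], device=device)
print(hessian(lambda x: model(x.view(1, 1)).squeeze(), x0))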
#%%
data["Model"]= yhat.detach().numpy()
# Derivative of x
data["dx"] = 2* data["x"] 

#%%
# data["model_dx"] = deriv.detach().numpy()

#%%
# data["model_dx2"] = deriv2.detach().numpy()