Derivative of model's output with respect to its input

I am trying to calculate higher-order gradients of a neural network's outputs with respect to its inputs.
I trained the network to approximate the function y = x^2. I then calculate the first derivative (code below) and get the expected result, 2*x. But when I calculate the second derivative I get all 0's (2 is expected). Can anyone please explain what I am doing wrong?

#%%
import pandas as pd
import numpy as np
from scipy.stats import norm
#%%
# Create dataset
N = 10000
data = pd.DataFrame(np.zeros((N, 2)), columns= ["x", "y"])

data["x"] = np.random.uniform(-50, 50, N)
data["y"] = data["x"]**2

#%%
import torch
import torch.nn as nn
#%%
X_train = data["x"]
y_train = data["y"]
#%%

X_train = X_train.values.reshape(-1,1)
y_train = y_train.values.reshape(-1,1)
#%%
## Convert to tensors
n_train = X_train.shape[0]
X_train = torch.tensor(X_train, dtype=torch.float).view(-1,1)
y_train = torch.tensor(y_train, dtype=torch.float).view(-1, 1)
X_train.shape
y_train.shape
#%%
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#%%
# Hyper-parameters 
input_size = 1
hidden_size = 64
num_epochs = 100
batch_size = 250

learning_rate = 10e-3    # 0.01

#%%
# Data loader
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
#%%
train_loader = torch.utils.data.DataLoader(dataset = train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=False)


#%%
# Fully connected neural network with two hidden layers

class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) 
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.last = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.last(out)
        out = self.relu(out)
        return(out)
        
    def u(self, features):
        u = self.forward(features)
        u.requires_grad_(True)
        return(u)
        
        

#%%
model = NeuralNet(input_size, hidden_size).to(device)
model
#%%
# Loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  

#%%
training_loss = []
# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (features, price) in enumerate(train_loader):
        # features and price come in batches
        # print("f: ", features)
        # print("p: ", price)
        # break
        # Move tensors to the configured device
        features = features.to(device)
        features.requires_grad_(True)
        price = price.to(device)
        
        # Forward pass
        output = model(features.float())
        loss = criterion(output, price)
                        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        training_loss.append(loss.item())
        
        # break
        if (i+1) % 10 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.8f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
            
    # break


#%%
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
#%%

# make predictions with the model
X_train.requires_grad_(True)
# yhat = model(X_train)
yhat = model.u(X_train)
#%%
print("Train")

print(criterion(y_train, yhat))

######################## Calculation of Derivative #################
#%%
deriv = torch.autograd.grad(outputs=[yhat], inputs=[X_train], grad_outputs=torch.ones_like(yhat),
                            allow_unused=True, retain_graph=True, create_graph=True)[0]
# deriv.requires_grad_(True)
deriv
#%%
# 2nd derivative
deriv2 = torch.autograd.grad(outputs=[deriv], inputs=[X_train], grad_outputs=torch.ones_like(yhat),
                             allow_unused=True, retain_graph=True, create_graph=True)[0]
deriv2
#%%
data["Model"]= yhat.detach().numpy()
# Derivative of x
data["dx"] = 2* data["x"] 

#%%
# data["model_dx"] = deriv.detach().numpy()

#%%
# data["model_dx2"] = deriv2.detach().numpy()

Hi Sanketabh,

You are using the ReLU function as the activation function. Its first derivative is 1 where x is greater than zero and 0 elsewhere, so its second derivative is zero everywhere it is defined. A network built only from linear layers and ReLUs is piecewise linear in its input, which is why the second derivative you compute comes out as all zeros. Try an activation function that is twice differentiable, such as sigmoid or tanh.
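For illustration, here is a minimal sketch of that fix (the SmoothNet name, the hidden size, and the choice of Tanh are just placeholders I picked, not the only option): replace the ReLU activations with Tanh and drop the activation on the output layer, then the same two autograd.grad calls return a nonzero second derivative.

import torch
import torch.nn as nn

class SmoothNet(nn.Module):
    def __init__(self, hidden_size=64):
        super().__init__()
        self.fc1 = nn.Linear(1, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.last = nn.Linear(hidden_size, 1)
        self.act = nn.Tanh()   # twice differentiable, unlike ReLU

    def forward(self, x):
        out = self.act(self.fc1(x))
        out = self.act(self.fc2(out))
        return self.last(out)  # no activation on the output layer

model = SmoothNet()
x = torch.linspace(-1, 1, 5).view(-1, 1).requires_grad_(True)
y = model(x)

# First derivative dy/dx; create_graph=True keeps the graph so we can
# differentiate once more.
dydx = torch.autograd.grad(y, x, grad_outputs=torch.ones_like(y),
                           create_graph=True)[0]

# Second derivative d2y/dx2 -- no longer identically zero, because Tanh
# has a nonzero second derivative.
d2ydx2 = torch.autograd.grad(dydx, x, grad_outputs=torch.ones_like(dydx),
                             create_graph=True)[0]
print(d2ydx2)

After training such a model on y = x**2, the second derivative should come out close to 2 across the training range; with the all-ReLU model above it is exactly zero everywhere.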
