 # Derivative of model's output with respect to its input

I am trying to calculate higher-order gradients of a neural network's output with respect to its input.
I trained the network to approximate the function y = x^2. The first derivative, computed below, comes out as expected (2*x), but the second derivative is all zeros, while 2 is expected. Can anyone explain what I am doing wrong?
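For comparison, the same double `torch.autograd.grad` pattern applied directly to y = x^2 (no network) does give 2 for the second derivative:

```python
import torch

# sanity check: double differentiation of the raw function y = x**2
x = torch.linspace(-5.0, 5.0, 11, requires_grad=True)
y = x ** 2

# first derivative dy/dx = 2x; create_graph=True keeps the graph
# so we can differentiate a second time
dydx = torch.autograd.grad(y, x, grad_outputs=torch.ones_like(y),
                           create_graph=True)[0]

# second derivative d2y/dx2 = 2
d2ydx2 = torch.autograd.grad(dydx, x, grad_outputs=torch.ones_like(dydx))[0]

print(dydx)    # 2*x at each point
print(d2ydx2)  # tensor of 2.0 at each point
```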

```python
#%%
import pandas as pd
import numpy as np
from scipy.stats import norm
#%%
# Create dataset
N = 10000
data = pd.DataFrame(np.zeros((N, 2)), columns= ["x", "y"])

data["x"] = np.random.uniform(-50, 50, N)
data["y"] = data["x"]**2

#%%
import torch
import torch.nn as nn
#%%
X_train = data["x"]
y_train = data["y"]
#%%

X_train = X_train.values.reshape(-1,1)
y_train = y_train.values.reshape(-1,1)
#%%
## Convert to tensors
n_train = X_train.shape[0]
X_train = torch.tensor(X_train, dtype=torch.float).view(-1,1)
y_train = torch.tensor(y_train, dtype=torch.float).view(-1, 1)
X_train.shape
y_train.shape
#%%
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#%%
# Hyper-parameters
input_size = 1
hidden_size = 64
num_epochs = 100
batch_size = 250

learning_rate = 1e-3    # 0.001

#%%
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
#%%
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           shuffle=False)

#%%
# Fully connected neural network with two hidden layers

class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.last = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.last(out)
        out = self.relu(out)
        return out

    def u(self, features):
        # convenience wrapper around forward()
        u = self.forward(features)
        return u

#%%
model = NeuralNet(input_size, hidden_size).to(device)
model
#%%
# Loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

#%%
training_loss = []
total_step = len(train_loader)
# Train the model
for epoch in range(num_epochs):
    for i, (features, price) in enumerate(train_loader):
        # features and price come in batches;
        # move tensors to the configured device
        features = features.to(device)
        price = price.to(device)

        # Forward pass
        output = model(features.float())
        loss = criterion(output, price)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        training_loss.append(loss.item())

        if (i+1) % 10 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.8f}'
                  .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

#%%
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
#%%

# make predictions with the model; the input has to require grad so that
# the output can later be differentiated with respect to it
X_train.requires_grad_(True)
yhat = model.u(X_train)
#%%
print("Train")

print(criterion(yhat, y_train))

######################## Calculation of Derivative #################
#%%
# 1st derivative of the output with respect to the input (expected: 2*x)
deriv = torch.autograd.grad(yhat, X_train,
                            grad_outputs=torch.ones_like(yhat),
                            allow_unused=True, retain_graph=True,
                            create_graph=True)[0]
deriv
#%%
# 2nd derivative (expected: 2, but it comes out as all zeros)
deriv2 = torch.autograd.grad(deriv, X_train,
                             grad_outputs=torch.ones_like(deriv),
                             allow_unused=True, retain_graph=True,
                             create_graph=True)[0]
deriv2
```
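For reference, the same second derivative at a single point can be cross-checked with `torch.autograd.functional.hessian`. A minimal sketch, assuming the `model` trained above (`x0` and `scalar_output` are just illustrative names):

```python
import torch

# pick an arbitrary evaluation point on the same device as the model
x0 = torch.tensor([3.0], device=next(model.parameters()).device)

def scalar_output(x):
    # wrap the network so it maps a 1-element tensor to a scalar output
    return model.u(x.view(1, 1)).squeeze()

# Hessian of the scalar output w.r.t. x0; for y = x**2 this should be ~2
hess = torch.autograd.functional.hessian(scalar_output, x0)
print(hess)
```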