Add the gradient with respect to the inputs into the loss function?

Hi! I want to use an ICNN (input convex neural network) as an integral function F(x), and take its derivative f(x) to approximate a line (y = ax + b).
My intuition: in my research I want to approximate a monotone function, and a convex function's derivative must be monotone increasing, so using an ICNN might let me reach a global minimum.
For the implementation, I used autograd to get the gradient of the output with respect to the input:

integral_x1 = net(x0)
net.zero_grad()
integral_x1[:, 0].backward(torch.ones(batch_size).to(device), retain_graph=True)
x1 = x0.grad

Then I take the MSE loss between the labels and x1 and do an Adam update. But this ICNN can't learn the integral of an affine function.
Has anyone tried putting the gradient of a neural network into the loss and training with Adam or SGD?
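
For reference, here is a minimal sketch of the kind of update I am after (this is not the exact code below; here I keep the input-gradient inside the autograd graph with torch.autograd.grad(..., create_graph=True), assuming the same net, ws_loss and optimizer as in my script):

x0 = x0.requires_grad_(True)
F_x0 = net(x0)                                   # ICNN output, interpreted as the integral F(x0)
f_x0, = torch.autograd.grad(F_x0.sum(), x0,
                            create_graph=True)   # dF/dx0, still differentiable w.r.t. the parameters
loss = ws_loss(f_x0, labels)                     # MSE between the predicted derivative and the labels
optimizer.zero_grad()
loss.backward()
optimizer.step()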

# Last modified: Feb 25 2020 by Jiaojiao Fan
# What is this file used for?
# I am testing: combining the gradient of the network into the loss function,
# and seeing whether this can approximate a line from dataInitial_line.npy


from network_initial import fNet

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Training data: each row is (x, label), where label is the target derivative value f(x)
trainset = np.load('./data/dataInitial_line.npy')
trainset = torch.from_numpy(trainset).float()
batch_size = 1000
trainloader = DataLoader(trainset, batch_size=batch_size,
                         shuffle=True, num_workers=0)

net = fNet()
net.to(device)
ws_loss = nn.MSELoss()
# optimizer = optim.SGD(net.parameters(), lr=0.00001, momentum=.999)
optimizer = optim.Adam(net.parameters(), lr=0.0001)

for epoch in range(20):
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):

        # x0 is the input, labels are the target derivative values f(x0)
        x0 = data[:, 0].to(device)
        x0 = Variable(x0, requires_grad=True)
        labels = data[:, 1].to(device)

        # forward pass: the ICNN output is interpreted as the integral F(x0)
        integral_x1 = net(x0)
        net.zero_grad()
        # gradient of F(x0) with respect to the input x0
        integral_x1[:, 0].backward(torch.ones(batch_size).to(device), retain_graph=True)
        x1 = x0.grad
        x1 = Variable(x1, requires_grad=True)  # this is the line I added

        loss = ws_loss(labels, x1)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # clamp the ICNN constraint: fc1 and fc2 weights must stay nonnegative
        with torch.no_grad():
            net.fc1.weight.masked_scatter_(net.fc1.weight < 0,
                                           torch.zeros(net.fc1.weight.shape).to(device))
            net.fc2.weight.masked_scatter_(net.fc2.weight < 0,
                                           torch.zeros(net.fc2.weight.shape).to(device))

        running_loss += loss.item()
        if i % 50 == 49:
            print('[%2d %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 50))
            running_loss = 0.0
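
(Side note: I think the nonnegativity clamp above could equivalently be written with in-place clamp_; a minimal sketch:)

with torch.no_grad():
    # I believe this is equivalent to the masked_scatter_ calls above
    net.fc1.weight.clamp_(min=0)
    net.fc2.weight.clamp_(min=0)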

Also, here is my network structure:

import torch
import torch.nn as nn
import torch.nn.functional as F

class fNet(nn.Module):
    """ICNN-style network: F(x) stays convex in x as long as the weights of
    fc1 and fc2 are kept nonnegative (which the training loop enforces)."""
    def __init__(self):
        super(fNet, self).__init__()
        self.fc0 = nn.Linear(1, 300)

        # the "_i" layers are passthrough (skip) connections from the input
        self.fc1_i = nn.Linear(1, 300, bias=False)
        self.fc1 = nn.Linear(300, 300)

        self.fc2_i = nn.Linear(1, 1, bias=False)
        self.fc2 = nn.Linear(300, 1)

    def forward(self, x):
        """Map a batch of scalars x to F(x) with shape (batch, 1)."""
        x0 = x.view(-1, 1)

        x1 = F.leaky_relu(self.fc0(x0))
        x1 = F.leaky_relu(self.fc1(x1) + self.fc1_i(x0))
        x1 = self.fc2(x1) + self.fc2_i(x0)
        return x1
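
A quick sanity check I run on the network by itself (illustrative only; the shapes assume the fNet above):

net = fNet()
x = torch.linspace(-1.0, 1.0, steps=5, requires_grad=True)
F_x = net(x)                                  # shape (5, 1): the "integral" values F(x)
f_x, = torch.autograd.grad(F_x.sum(), x)      # dF/dx at each of the 5 points, shape (5,)
print(F_x.shape, f_x.shape)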