Getting different gradients from torch.autograd.grad and torch.gradient

The structure of my network is as follows:

import torch

class Net(torch.nn.Module):
    def __init__(self, dim=[1, 20, 1]):
        super(Net, self).__init__()
        self._net = FCN(dim[0], dim[1], dim[-1])

    def forward(self, u, x):
        y_tr = self._net(x)
        # combine u with the network output (elementwise, u broadcast over the S dimension)
        y_out = torch.einsum('BbO,BSbO->BSbO', u, y_tr)
        return y_out

class FCN(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(FCN, self).__init__()
        self.hidden1 = torch.nn.Linear(n_feature, n_hidden, bias=True)
        self.hidden2 = torch.nn.Linear(n_hidden, n_hidden, bias=True)
        self.hidden3 = torch.nn.Linear(n_hidden, n_hidden, bias=True)
        self.hidden4 = torch.nn.Linear(n_hidden, n_hidden, bias=True)
        self.predict = torch.nn.Linear(n_hidden, n_output, bias=True)

    def forward(self, y):
        # four hidden layers with sine activations, followed by a linear output layer
        x = torch.sin(self.hidden1(y))
        x = torch.sin(self.hidden2(x))
        x = torch.sin(self.hidden3(x))
        x = torch.sin(self.hidden4(x))
        x = self.predict(x)
        return x
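
For reference, the einsum in forward does not sum over any index; as I understand it, it is just an elementwise product of u with the network output, with u broadcast over the S dimension. A small check with made-up sizes:

import torch

B, S, b, O = 2, 3, 4, 5                          # made-up sizes for illustration
u = torch.rand(B, b, O)
y = torch.rand(B, S, b, O)
out = torch.einsum('BbO,BSbO->BSbO', u, y)
print(torch.allclose(out, u.unsqueeze(1) * y))   # True: u is broadcast over S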

The inputs of the Net are:

t = torch.linspace(0.0125, 1, 80).view(80, 1).repeat(5, 40, 1, 1)   # shape (5, 40, 80, 1)
t.requires_grad = True
tsize = t.size()
u = torch.rand(tsize[0], tsize[2], 20)                               # shape (5, 80, 20)

Then I want to calculate the derivative of the output of Net with respect to t. There are two ways to do this: one is torch.autograd.grad, and the other is to compute it numerically with torch.gradient.

model = Net(dim=[1, 100, 20])
test = model(u, t)
# automatic differentiation: gradient of the sum of test[:,0,:,0] w.r.t. t, then sliced
Grad_auto = torch.autograd.grad(test[:, 0, :, 0], t, retain_graph=True,
                                grad_outputs=torch.ones_like(test[:, 0, :, 0]))[0][0, 0, :, 0]
# numerical differentiation along dim=1 (the 80-point t axis) with step 0.0125
Grad_Num = torch.gradient(test[:, 0, :, 0], spacing=0.0125, edge_order=2, dim=1)[0][0, :]

The results of Grad_Num and Grad_auto are different. Has anyone encountered a similar situation? Any comment or help is appreciated!

Here are some updates on the test.

When I calculate the gradients of y_tr with respect to x, the outputs of torch.gradient and torch.autograd.grad are almost the same, but once torch.einsum is applied they give two different results. I have checked that the results of torch.gradient are correct against a 2nd-order finite difference scheme.
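
A standalone sketch of the 2nd-order finite-difference check I mean (using a made-up test function torch.sin instead of the network); it should agree with torch.gradient(..., edge_order=2) up to floating-point error:

import torch

def central_diff(f_vals, h):
    # 2nd-order finite differences: central stencil in the interior, one-sided at the edges
    d = torch.zeros_like(f_vals)
    d[1:-1] = (f_vals[2:] - f_vals[:-2]) / (2 * h)
    d[0] = (-3 * f_vals[0] + 4 * f_vals[1] - f_vals[2]) / (2 * h)
    d[-1] = (3 * f_vals[-1] - 4 * f_vals[-2] + f_vals[-3]) / (2 * h)
    return d

h = 0.0125
vals = torch.sin(torch.linspace(h, 1, 80))
ref = torch.gradient(vals, spacing=h, edge_order=2)[0]
print((central_diff(vals, h) - ref).abs().max())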

Further details

model = Net(dim=[1, 100, 20])
test = model._net(t)   # only the FCN part (y_tr), without the einsum
Grad_auto = torch.autograd.grad(test[:, 0, :, 0], t, retain_graph=True,
                                grad_outputs=torch.ones_like(test[:, 0, :, 0]))[0][0, 0, :, 0]
Grad_Num = torch.gradient(test[:, 0, :, 0], spacing=0.0125, edge_order=2, dim=1)[0][0, :]

I get

Grad_auto: tensor([-0.0302, -0.0299, -0.0297, -0.0294, -0.0291, -0.0288, -0.0285, -0.0281,
        -0.0278, -0.0274, -0.0271, -0.0267, -0.0263, -0.0259, -0.0255, -0.0251,
        -0.0247, -0.0243, -0.0238, -0.0234, -0.0229, -0.0225, -0.0220, -0.0215,
        -0.0210, -0.0205, -0.0200, -0.0195, -0.0190, -0.0185, -0.0179, -0.0174,
        -0.0169, -0.0163, -0.0157, -0.0152, -0.0146, -0.0140, -0.0134, -0.0129,
        -0.0123, -0.0117, -0.0111, -0.0105, -0.0098, -0.0092, -0.0086, -0.0080,
        -0.0074, -0.0067, -0.0061, -0.0055, -0.0048, -0.0042, -0.0036, -0.0029,
        -0.0023, -0.0017, -0.0010, -0.0004,  0.0003,  0.0009,  0.0015,  0.0022,
         0.0028,  0.0035,  0.0041,  0.0047,  0.0053,  0.0060,  0.0066,  0.0072,
         0.0078,  0.0085,  0.0091,  0.0097,  0.0103,  0.0109,  0.0115,  0.0121])
Grad_Num: tensor([-0.0302, -0.0299, -0.0297, -0.0294, -0.0291, -0.0288, -0.0285, -0.0281,
        -0.0278, -0.0274, -0.0271, -0.0267, -0.0263, -0.0259, -0.0255, -0.0251,
        -0.0247, -0.0243, -0.0238, -0.0234, -0.0229, -0.0225, -0.0220, -0.0215,
        -0.0210, -0.0205, -0.0200, -0.0195, -0.0190, -0.0185, -0.0179, -0.0174,
        -0.0169, -0.0163, -0.0157, -0.0152, -0.0146, -0.0140, -0.0134, -0.0129,
        -0.0123, -0.0117, -0.0111, -0.0105, -0.0098, -0.0092, -0.0086, -0.0080,
        -0.0074, -0.0067, -0.0061, -0.0055, -0.0048, -0.0042, -0.0036, -0.0029,
        -0.0023, -0.0017, -0.0010, -0.0004,  0.0003,  0.0009,  0.0015,  0.0022,
         0.0028,  0.0035,  0.0041,  0.0047,  0.0053,  0.0060,  0.0066,  0.0072,
         0.0078,  0.0085,  0.0091,  0.0097,  0.0103,  0.0109,  0.0115,  0.0121],
       grad_fn=<SliceBackward0>)

They are almost the same, but if I instead compute

model = Net(dim=[1, 100, 20])
test = model(u, t)     # the full Net, including the einsum
Grad_auto = torch.autograd.grad(test[:, 0, :, 0], t, retain_graph=True,
                                grad_outputs=torch.ones_like(test[:, 0, :, 0]))[0][0, 0, :, 0]
Grad_Num = torch.gradient(test[:, 0, :, 0], spacing=0.0125, edge_order=2, dim=1)[0][0, :]

I get

Grad_auto: tensor([-4.5219e-02, -1.6293e-02, -1.5394e-02, -1.5503e-02, -5.1214e-03,
        -2.3390e-02, -3.6110e-02, -5.1451e-02, -2.7668e-02, -1.5929e-02,
        -2.4481e-02, -2.5683e-02, -2.4750e-02, -4.6403e-02, -8.4468e-03,
        -3.6779e-02, -1.5549e-02, -4.8169e-02, -2.4843e-03, -1.3870e-02,
        -4.4307e-02, -1.7717e-02, -2.9056e-02, -4.5341e-02, -1.7182e-02,
        -8.7080e-03, -4.4529e-02, -3.1706e-02, -3.4113e-03, -1.7208e-02,
        -3.1676e-02, -2.6727e-02, -3.8975e-02, -3.1970e-03, -1.3167e-02,
        -1.4034e-02, -2.8083e-02, -3.0505e-03, -1.1408e-02, -2.3052e-02,
        -1.2609e-02, -2.1820e-02, -1.7178e-02, -3.3568e-02, -7.4343e-03,
        -8.7557e-03, -2.0096e-02, -1.1489e-03, -9.1030e-03, -1.6797e-02,
        -2.7509e-02, -2.7829e-02, -2.6466e-02, -2.4330e-02, -1.2996e-02,
        -2.2515e-02, -2.8248e-05, -5.0883e-03, -2.0221e-02, -1.8988e-02,
        -1.7534e-02, -1.5191e-05, -1.6414e-04, -5.9557e-03, -3.1696e-04,
        -4.5885e-03, -6.3587e-03, -3.7191e-03, -7.9422e-03, -3.0674e-04,
        -8.3384e-03, -1.3917e-02, -8.3659e-03, -1.7350e-03, -1.0139e-02,
        -2.0200e-03, -8.2367e-04, -1.0043e-02, -7.3871e-03, -1.8898e-04])
Grad_Num: tensor([-2.1116e+00, -7.4022e-01, -3.0265e-02, -2.4723e-01,  1.6750e-01,
         6.8699e-01,  5.9923e-01, -2.0730e-01, -7.9020e-01, -8.5504e-02,
         1.8990e-01, -1.0973e-02,  3.9542e-01, -3.3557e-01, -2.1574e-01,
         1.2894e-01,  1.8813e-01, -2.4889e-01, -6.4701e-01,  7.3660e-01,
         5.7942e-02, -2.8741e-01,  4.5057e-01, -2.1279e-01, -6.1757e-01,
         4.2114e-01,  3.5071e-01, -6.5119e-01, -2.3490e-01,  4.0978e-01,
         1.2544e-01,  8.2907e-02, -3.3769e-01, -3.6841e-01,  1.3976e-01,
         1.8268e-01, -1.4540e-01, -2.1980e-01,  2.3584e-01,  7.1297e-03,
        -2.7621e-02,  4.2702e-02,  1.1259e-01, -1.1151e-01, -2.7050e-01,
         1.1953e-01, -7.7464e-02, -1.1314e-01,  1.3835e-01,  1.5239e-01,
         8.0997e-02, -2.5239e-02, -4.4020e-02, -1.1540e-01, -2.8067e-02,
        -9.5211e-02, -1.2568e-01,  1.2231e-01,  7.6269e-02, -2.7935e-02,
        -1.0867e-01, -9.3787e-02,  2.5867e-02,  4.9441e-04, -9.2813e-03,
         1.9442e-02, -5.9800e-03, -1.0348e-03, -1.0048e-02, -5.9261e-03,
         1.6235e-02, -7.4092e-03, -1.6379e-02, -8.5213e-03, -1.9876e-03,
         1.1130e-03, -1.7517e-02, -1.7643e-02,  1.8403e-02,  5.4008e-02],
       grad_fn=<SliceBackward0>)

The only difference is that the latter case applies torch.einsum.
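
In case it helps reproduce the issue without the trained network, here is a minimal sketch (my own simplification, not the original model) that replaces the FCN with torch.sin and keeps only the einsum step:

import torch

h = 0.0125
t = torch.linspace(h, 1, 80).view(80, 1).repeat(5, 40, 1, 1)   # (B, S, b, 1) = (5, 40, 80, 1)
t.requires_grad = True
u = torch.rand(5, 80, 20)                                       # (B, b, O)

y_tr = torch.sin(t).repeat(1, 1, 1, 20)                         # stand-in for self._net(t)
y_out = torch.einsum('BbO,BSbO->BSbO', u, y_tr)

out = y_out[:, 0, :, 0]
Grad_auto = torch.autograd.grad(out, t, retain_graph=True,
                                grad_outputs=torch.ones_like(out))[0][0, 0, :, 0]
Grad_Num = torch.gradient(out, spacing=h, edge_order=2, dim=1)[0][0, :]
print(Grad_auto[:5])
print(Grad_Num[:5])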

Following the discussion above, this should serve as a temporary workaround.