I am trying to calculate the gradients of the output with respect to the input of a network that contains recurrent layers.

I create the input variable with `requires_grad = True`, run the forward pass and backpropagation, but the gradient with respect to the input is `None`. The same approach works fine for convolutional networks, and there the input is a leaf variable, so I don't think that `register_hook` is the way to go.
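
(For context, this is my understanding of the leaf rule, checked on a toy tensor; the snippet below is not part of the actual script:)

```
import torch

x = torch.ones(2, 3, requires_grad=True)
print(x.is_leaf)  # True: created directly by the user, so backward() fills x.grad
y = x * 2
print(y.is_leaf)  # False: result of an op; y.grad stays None unless y.retain_grad() is called
```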

Here is a minimal example:

```
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import numpy as np

sequence_length = 97
print(sequence_length)


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.gru = nn.GRU(input_size=5, hidden_size=5, num_layers=1, batch_first=True)
        self.linear = nn.Linear(5, 1)

    def forward(self, x, h0):
        h0 = h0.permute(1, 0, 2)  # (batch, layers, hidden) -> (layers, batch, hidden)
        out, h = self.gru(x, h0)
        x = out[:, -1, :]  # keep only the last time step
        x = self.linear(x)
        x = torch.sigmoid(x)
        return x


model = Net()
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.BCELoss()
print("model")

data = Variable(torch.normal(torch.zeros(4, sequence_length, 5),
                             torch.ones(4, sequence_length, 5)), requires_grad=True)
data = data.cuda()  # move the input to the GPU
label = torch.from_numpy(np.array([[1], [0], [1], [0]]).astype('float32'))  # (4, 1) to match the output shape
label = label.cuda()
print("data")

for epoch in range(1000):
    h0 = Variable(torch.randn(4, 1, 5).cuda())
    output = model(data, h0)
    print(output.shape)
    print(output)
    loss = criterion(output, label)
    print("loss ", float(loss))
    optimizer.zero_grad()
    loss.backward()
    print(data.grad)  # prints None
    optimizer.step()
    event = 'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
        epoch, 1 * len(data), 1, 100. * 1 / 1, float(loss) / 1)
    print(event)
```
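
For comparison, this is roughly the pattern that works for me with a convolutional network (a minimal sketch with arbitrary layer sizes, run on CPU):

```
import torch
import torch.nn as nn

conv = nn.Conv1d(in_channels=5, out_channels=5, kernel_size=3, padding=1)
data = torch.randn(4, 5, 97, requires_grad=True)  # leaf tensor
loss = conv(data).mean()
loss.backward()
print(data.grad.shape)  # torch.Size([4, 5, 97]) -- the gradient is populated here
```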

What is the best way to get the gradients with respect to the input?