I am trying to set up some simulated data and a simple neural net for better understanding of the fundamentals:

```
import torch
import torch.optim as optim
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# Simulated binary-classification data: three uniform random features per row
# and a label telling whether a fixed linear score of those features is at or
# above the mean score over all rows.
nrows = 9000
ntrain = int(nrows * .7)

X = torch.rand(nrows, 3)
weights = torch.from_numpy(np.array([[.1], [2], [3]], dtype=np.float32))
scores = torch.mm(X, weights)
# Alternative encoding (unused): concatenate both comparisons column-wise,
# like hstack in numpy, to get two indicator columns instead of a class index:
#   torch.cat([scores < torch.mean(scores), scores >= torch.mean(scores)], dim=1)
Y = (scores >= torch.mean(scores)).type(torch.LongTensor).view(nrows)

# The first ``ntrain`` rows form the training split; the rest the test split.
Xtr, Xte = X[:ntrain, :], X[ntrain:, :]
Ytr, Yte = Y[:ntrain], Y[ntrain:]
# Gradients captured by the backward hooks, keyed by layer name. Each entry is
# a dict with two lists, "grad_input" and "grad_output", holding one item per
# recorded hook call, in call order.
grad_dict: dict = {}


def fc_hook(layer_name, grad_input, grad_output):
    """Append one hook call's gradients for ``layer_name`` to ``grad_dict``.

    Every call is recorded, including the first one for a given layer (the
    entry is created on demand and then appended to).

    Args:
        layer_name: Key under which the gradients are stored.
        grad_input: The ``grad_input`` value handed to the backward hook.
        grad_output: The ``grad_output`` value handed to the backward hook.

    Returns:
        None. The function only records; it never hands a value back to the
        caller.
    """
    record = grad_dict.setdefault(
        layer_name, {"grad_input": [], "grad_output": []}
    )
    record["grad_input"].append(grad_input)
    record["grad_output"].append(grad_output)
class Net(nn.Module):
    """Fully connected classifier (3 -> 20 -> 30 -> 2) with gradient hooks.

    Each of the three ``nn.Linear`` layers has a backward hook registered via
    ``register_backward_hook``; every hook forwards its ``grad_input`` and
    ``grad_output`` to ``fc_hook`` under the layer's name.
    """

    def __init__(self):
        super().__init__()
        # Set once here; not read anywhere inside this class.
        self.hooked = False

        self.fc1 = nn.Linear(3, 20)
        # The two ReLU modules are registered on the network, but ``forward``
        # applies the functional ``F.relu`` instead of calling them.
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(20, 30)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(30, 2)

        # Keep the objects returned by hook registration on the instance.
        self.fc1_hook_handle = self.fc1.register_backward_hook(
            self.fc1_backward_hook)
        self.fc2_hook_handle = self.fc2.register_backward_hook(
            self.fc2_backward_hook)
        self.fc3_hook_handle = self.fc3.register_backward_hook(
            self.fc3_backward_hook)

    def forward(self, x):
        """Return the two-column output of ``fc3`` for input ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def fc1_backward_hook(self, module, grad_input, grad_output):
        """Record the gradients seen by ``fc1``; ``module`` is ignored."""
        fc_hook("fc1", grad_input, grad_output)

    def fc2_backward_hook(self, module, grad_input, grad_output):
        """Record the gradients seen by ``fc2``; ``module`` is ignored."""
        fc_hook("fc2", grad_input, grad_output)

    def fc3_backward_hook(self, module, grad_input, grad_output):
        """Record the gradients seen by ``fc3``; ``module`` is ignored."""
        fc_hook("fc3", grad_input, grad_output)
# Run on the GPU when one is available; otherwise fall back to the CPU so the
# script does not crash on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = Net().to(device)
print(net)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=.8)

NUM_EPOCH = 2
NUM_PER_BATCH = 4
index_pool = np.arange(Xtr.size(0))
for epoch in range(NUM_EPOCH):  # loop over the dataset multiple times
    running_loss = 0.0
    # One optimisation step per training row index. Each step draws its own
    # random mini-batch (without replacement inside the batch), so ``i`` only
    # counts steps; it does not select which rows are used.
    for i in index_pool:
        indices = torch.from_numpy(
            np.random.choice(index_pool, size=NUM_PER_BATCH, replace=False))
        # Plain tensors take part in autograd directly; no Variable wrapper
        # is needed.
        inputs = Xtr[indices, :].to(device)
        labels = Ytr[indices].to(device)
        outputs = net(inputs)
        optimizer.zero_grad()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 2000 == 1999:  # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0
# Accuracy on the held-out split: the predicted class is the index of the
# larger of the two network outputs for each row.
# The data is moved to whatever device the model's parameters live on, and
# the forward pass runs under no_grad because no backward pass follows, so
# recording an autograd graph for the whole test set would be wasted work.
eval_device = next(net.parameters()).device
with torch.no_grad():
    predicted = torch.max(net(Xte.to(eval_device)), dim=1)[1].cpu()
accuracy = torch.mean(torch.eq(predicted, Yte).type(torch.FloatTensor))
print("Accuracy of prediction on test dataset: %f" % accuracy.item())
# Inspect what the fc2 backward hook recorded. Index 0 below selects the
# first recorded hook call; the following index selects one element of the
# tuple that was passed to the hook.
fc2_record = grad_dict["fc2"]
first_grad_input = fc2_record["grad_input"][0]
first_grad_output = fc2_record["grad_output"][0]

print(first_grad_input[0])
print(first_grad_output[0])

# NOTE(review): the result of this comparison is discarded, so it only shows
# something in an interactive session. It also assumes the second recorded
# grad_output tuple has at least two elements - confirm that this is intended.
fc2_record["grad_input"][1][1] == fc2_record["grad_output"][1][1]

# Sizes of the first three elements of the first recorded grad_input,
# followed by the size of the first element of the matching grad_output.
for position in range(3):
    print(first_grad_input[position].size())
print(first_grad_output[0].size())
```

Each layer in the neural net has a backward hook, but I don't understand what `grad_input` and `grad_output` actually mean. Could anyone explain? Thanks.