An interesting thing happens when printing inside backward:
grad_output tensor([[-0.0003, -0.0003, -0.0003, ..., -0.0003, -0.0003, -0.0003],
[-0.0003, -0.0003, -0.0003, ..., -0.0003, -0.0003, -0.0003],
[-0.0003, -0.0003, -0.0003, ..., -0.0003, -0.0003, -0.0003],
...,
[-0.0003, -0.0003, -0.0003, ..., -0.0003, -0.0003, -0.0003],
[-0.0003, -0.0003, -0.0003, ..., -0.0003, -0.0003, -0.0003],
[-0.0003, -0.0003, -0.0003, ..., -0.0003, -0.0003, -0.0003]])
grad_output.shape torch.Size([100, 64])
type(ctx) <class 'torch.autograd.function.LinearFunctionBackward'>
ctx.saved_tensors (tensor([[-0.0130, 0.1299, -0.1079, ..., -0.0327, 0.0201, 0.1370],
[-0.0130, 0.1299, -0.1079, ..., -0.0327, 0.0201, 0.1370],
[-0.0130, 0.1299, -0.1079, ..., -0.0327, 0.0201, 0.1370],
...,
[-0.0130, 0.1299, -0.1079, ..., -0.0327, 0.0201, 0.1370],
[-0.0130, 0.1299, -0.1079, ..., -0.0327, 0.0201, 0.1370],
[-0.0130, 0.1299, -0.1079, ..., -0.0327, 0.0201, 0.1370]],
grad_fn=<LinearFunctionBackward>), tensor([[-0.0343, 0.0935, 0.0341, ..., 0.0397, -0.0944, 0.0636],
[ 0.0004, 0.0325, 0.0200, ..., -0.0412, -0.0044, 0.0804],
[-0.0924, 0.0477, 0.0244, ..., 0.0977, 0.0955, 0.0971],
...,
[-0.0924, -0.0543, -0.0400, ..., -0.0085, -0.0072, 0.0443],
[-0.0646, 0.0025, -0.0068, ..., 0.0974, -0.0356, 0.0807],
[ 0.0744, -0.0510, -0.0750, ..., 0.0472, 0.0138, 0.0920]],
requires_grad=True), tensor([ 0.0030, -0.0111, -0.0229, -0.0249, -0.0989, 0.0892, 0.0206, 0.0536,
0.0978, 0.0473, -0.0724, -0.0930, 0.0703, 0.0183, -0.0407, 0.0490,
0.0304, -0.0751, -0.0962, 0.0294, -0.0906, -0.0313, 0.0924, -0.0462,
-0.0834, -0.0942, -0.0217, 0.0581, 0.0379, -0.0543, 0.0411, -0.0707,
-0.0958, 0.0415, -0.0499, -0.0942, -0.0491, 0.0283, 0.0752, 0.0812,
0.0318, 0.0540, 0.0340, 0.0257, 0.0744, -0.0112, 0.0081, -0.0952,
0.0921, -0.0628, 0.0688, -0.0160, 0.0923, 0.0252, 0.0945, -0.0840,
-0.0888, -0.0278, -0.0439, 0.0531, 0.0374, -0.0352, 0.0247, 0.0621],
requires_grad=True))
input.shape torch.Size([100, 64])
input.requires_grad True
weight.shape torch.Size([64, 64])
bias.shape torch.Size([64])
ctx.needs_input_grad (True, True, True)
grad_input tensor([[-1.7603e-04, -6.1180e-06, 1.2589e-04, ..., -4.7342e-05,
-1.6849e-04, -1.2383e-04],
[-1.7603e-04, -6.1180e-06, 1.2589e-04, ..., -4.7342e-05,
-1.6849e-04, -1.2383e-04],
[-1.7603e-04, -6.1180e-06, 1.2589e-04, ..., -4.7342e-05,
-1.6849e-04, -1.2383e-04],
...,
[-1.7603e-04, -6.1180e-06, 1.2589e-04, ..., -4.7342e-05,
-1.6849e-04, -1.2383e-04],
[-1.7603e-04, -6.1180e-06, 1.2589e-04, ..., -4.7342e-05,
-1.6849e-04, -1.2383e-04],
[-1.7603e-04, -6.1180e-06, 1.2589e-04, ..., -4.7342e-05,
-1.6849e-04, -1.2383e-04]])
grad_weight tensor([[ 0.0004, -0.0039, 0.0032, ..., 0.0010, -0.0006, -0.0041],
[ 0.0004, -0.0040, 0.0033, ..., 0.0010, -0.0006, -0.0042],
[ 0.0004, -0.0041, 0.0034, ..., 0.0010, -0.0006, -0.0043],
...,
[ 0.0004, -0.0043, 0.0036, ..., 0.0011, -0.0007, -0.0046],
[ 0.0004, -0.0040, 0.0033, ..., 0.0010, -0.0006, -0.0042],
[ 0.0004, -0.0036, 0.0030, ..., 0.0009, -0.0006, -0.0038]])
grad_bias tensor([-0.0298, -0.0306, -0.0315, -0.0316, -0.0330, -0.0283, -0.0304, -0.0291,
-0.0278, -0.0285, -0.0349, -0.0339, -0.0281, -0.0317, -0.0338, -0.0310,
-0.0316, -0.0342, -0.0292, -0.0290, -0.0329, -0.0333, -0.0320, -0.0319,
-0.0328, -0.0366, -0.0312, -0.0322, -0.0283, -0.0373, -0.0311, -0.0345,
-0.0319, -0.0297, -0.0338, -0.0322, -0.0320, -0.0278, -0.0304, -0.0298,
-0.0314, -0.0299, -0.0295, -0.0319, -0.0276, -0.0316, -0.0314, -0.0332,
-0.0275, -0.0317, -0.0302, -0.0317, -0.0313, -0.0312, -0.0276, -0.0332,
-0.0362, -0.0335, -0.0309, -0.0311, -0.0269, -0.0334, -0.0308, -0.0278])
grad_output tensor([[-1.7603e-04, -6.1180e-06, 1.2589e-04, ..., -4.7342e-05,
-1.6849e-04, -1.2383e-04],
[-1.7603e-04, -6.1180e-06, 1.2589e-04, ..., -4.7342e-05,
-1.6849e-04, -1.2383e-04],
[-1.7603e-04, -6.1180e-06, 1.2589e-04, ..., -4.7342e-05,
-1.6849e-04, -1.2383e-04],
...,
[-1.7603e-04, -6.1180e-06, 1.2589e-04, ..., -4.7342e-05,
-1.6849e-04, -1.2383e-04],
[-1.7603e-04, -6.1180e-06, 1.2589e-04, ..., -4.7342e-05,
-1.6849e-04, -1.2383e-04],
[-1.7603e-04, -6.1180e-06, 1.2589e-04, ..., -4.7342e-05,
-1.6849e-04, -1.2383e-04]])
grad_output.shape torch.Size([100, 64])
type(ctx) <class 'torch.autograd.function.LinearFunctionBackward'>
ctx.saved_tensors (tensor([[1., 1., 1.],
[1., 1., 1.],
[1., 1., 1.],
...,
[1., 1., 1.],
[1., 1., 1.],
[1., 1., 1.]]), tensor([[ 0.0842, -0.0503, -0.0517],
[ 0.0767, 0.0256, 0.0085],
[-0.0636, 0.0073, -0.0771],
[ 0.0128, 0.0598, 0.0479],
[-0.0554, -0.0358, -0.0874],
[-0.0494, 0.0081, -0.0187],
[-0.0284, -0.0936, -0.0620],
[-0.0569, -0.0838, -0.0695],
[-0.0769, 0.0653, 0.0156],
[ 0.0631, 0.0331, -0.0328],
[ 0.0566, 0.0832, 0.0893],
[ 0.0138, 0.0158, -0.0760],
[-0.0720, -0.0756, 0.0478],
[ 0.0579, -0.0173, 0.0116],
[ 0.0295, 0.0456, 0.0078],
[-0.0311, 0.0026, -0.0419],
[ 0.0587, 0.0382, 0.0799],
[ 0.0206, 0.0552, -0.0481],
[ 0.0311, 0.0883, 0.0899],
[ 0.0028, 0.0687, -0.0498],
[-0.0646, -0.0223, 0.0790],
[ 0.0773, 0.0483, 0.0244],
[-0.0832, -0.0044, 0.0500],
[ 0.0530, -0.0554, -0.0592],
[ 0.0825, -0.0022, -0.0089],
[-0.0434, 0.0201, -0.0196],
[-0.0718, -0.0063, -0.0274],
[-0.0669, -0.0193, 0.0297],
[-0.0522, 0.0782, -0.0273],
[ 0.0184, -0.0488, 0.0038],
[-0.0179, -0.0462, 0.0904],
[-0.0429, 0.0611, -0.0567],
[ 0.0525, -0.0011, 0.0973],
[-0.0022, -0.0830, -0.0755],
[ 0.0401, 0.0580, 0.0095],
[ 0.0870, 0.0267, 0.0700],
[ 0.0374, 0.0876, 0.0032],
[ 0.0487, -0.0419, 0.0846],
[-0.0027, -0.0206, 0.0068],
[ 0.0997, -0.0110, 0.0023],
[ 0.0763, 0.0883, 0.0147],
[ 0.0381, -0.0788, 0.0771],
[-0.0689, 0.0763, -0.0097],
[ 0.0846, -0.0871, 0.0942],
[ 0.0914, -0.0359, -0.0436],
[ 0.0562, -0.0068, -0.0124],
[ 0.0080, -0.0139, -0.0580],
[-0.0250, -0.0011, -0.0274],
[-0.0194, 0.0851, -0.0085],
[-0.0947, 0.0152, 0.0870],
[ 0.0986, -0.0572, 0.0172],
[ 0.0699, -0.0794, 0.0199],
[-0.0658, -0.0908, -0.0678],
[ 0.0060, 0.0376, -0.0140],
[-0.0589, 0.0665, -0.0053],
[ 0.0355, -0.0619, -0.0941],
[ 0.0111, 0.0337, 0.0725],
[-0.0520, -0.0429, -0.0760],
[-0.0894, 0.0772, -0.0128],
[-0.0798, -0.0291, 0.0563],
[-0.0549, 0.0833, 0.0315],
[ 0.0215, 0.0446, -0.0987],
[-0.0185, -0.0308, 0.0578],
[-0.0027, -0.0376, 0.0975]], requires_grad=True), tensor([ 0.0048, 0.0191, 0.0255, -0.0865, -0.0007, 0.0818, -0.0146, -0.0640,
-0.0144, -0.0680, 0.0739, -0.0047, 0.0962, -0.0705, -0.0117, -0.0876,
0.0772, -0.0727, 0.0529, -0.0311, 0.0123, 0.0245, 0.0298, 0.0037,
0.0168, -0.0061, 0.0558, 0.0338, 0.0988, 0.0884, -0.0939, 0.0782,
0.0447, 0.0091, 0.0015, -0.0365, -0.0928, 0.0182, -0.0843, 0.0652,
0.0750, 0.0602, 0.0864, -0.0776, 0.0879, -0.0737, 0.0300, 0.0027,
0.0180, -0.0602, -0.0158, -0.0497, -0.0657, -0.0122, -0.0545, -0.0912,
-0.0768, 0.0527, 0.0294, 0.0801, -0.0027, -0.0001, 0.0116, 0.0798],
requires_grad=True))
input.shape torch.Size([100, 3])
input.requires_grad False
weight.shape torch.Size([64, 3])
bias.shape torch.Size([64])
ctx.needs_input_grad (False, True, True)
grad_weight tensor([[-0.0176, -0.0176, -0.0176],
[-0.0006, -0.0006, -0.0006],
[ 0.0126, 0.0126, 0.0126],
[-0.0095, -0.0095, -0.0095],
[-0.0203, -0.0203, -0.0203],
[ 0.0125, 0.0125, 0.0125],
[-0.0236, -0.0236, -0.0236],
[-0.0098, -0.0098, -0.0098],
[-0.0102, -0.0102, -0.0102],
[ 0.0024, 0.0024, 0.0024],
[-0.0149, -0.0149, -0.0149],
[ 0.0075, 0.0075, 0.0075],
[ 0.0161, 0.0161, 0.0161],
[ 0.0039, 0.0039, 0.0039],
[ 0.0031, 0.0031, 0.0031],
[ 0.0025, 0.0025, 0.0025],
[ 0.0088, 0.0088, 0.0088],
[ 0.0129, 0.0129, 0.0129],
[-0.0175, -0.0175, -0.0175],
[ 0.0151, 0.0151, 0.0151],
[-0.0161, -0.0161, -0.0161],
[-0.0131, -0.0131, -0.0131],
[-0.0135, -0.0135, -0.0135],
[ 0.0094, 0.0094, 0.0094],
[ 0.0132, 0.0132, 0.0132],
[ 0.0022, 0.0022, 0.0022],
[-0.0160, -0.0160, -0.0160],
[-0.0109, -0.0109, -0.0109],
[ 0.0171, 0.0171, 0.0171],
[-0.0049, -0.0049, -0.0049],
[ 0.0328, 0.0328, 0.0328],
[ 0.0063, 0.0063, 0.0063],
[ 0.0046, 0.0046, 0.0046],
[-0.0100, -0.0100, -0.0100],
[-0.0134, -0.0134, -0.0134],
[ 0.0206, 0.0206, 0.0206],
[ 0.0056, 0.0056, 0.0056],
[-0.0013, -0.0013, -0.0013],
[ 0.0190, 0.0190, 0.0190],
[ 0.0124, 0.0124, 0.0124],
[-0.0058, -0.0058, -0.0058],
[-0.0196, -0.0196, -0.0196],
[ 0.0174, 0.0174, 0.0174],
[ 0.0144, 0.0144, 0.0144],
[-0.0022, -0.0022, -0.0022],
[-0.0100, -0.0100, -0.0100],
[-0.0004, -0.0004, -0.0004],
[-0.0023, -0.0023, -0.0023],
[-0.0202, -0.0202, -0.0202],
[-0.0018, -0.0018, -0.0018],
[-0.0095, -0.0095, -0.0095],
[ 0.0012, 0.0012, 0.0012],
[-0.0105, -0.0105, -0.0105],
[ 0.0061, 0.0061, 0.0061],
[ 0.0087, 0.0087, 0.0087],
[-0.0132, -0.0132, -0.0132],
[-0.0198, -0.0198, -0.0198],
[ 0.0028, 0.0028, 0.0028],
[ 0.0237, 0.0237, 0.0237],
[ 0.0028, 0.0028, 0.0028],
[ 0.0136, 0.0136, 0.0136],
[-0.0047, -0.0047, -0.0047],
[-0.0168, -0.0168, -0.0168],
[-0.0124, -0.0124, -0.0124]])
grad_bias tensor([-0.0176, -0.0006, 0.0126, -0.0095, -0.0203, 0.0125, -0.0236, -0.0098,
-0.0102, 0.0024, -0.0149, 0.0075, 0.0161, 0.0039, 0.0031, 0.0025,
0.0088, 0.0129, -0.0175, 0.0151, -0.0161, -0.0131, -0.0135, 0.0094,
0.0132, 0.0022, -0.0160, -0.0109, 0.0171, -0.0049, 0.0328, 0.0063,
0.0046, -0.0100, -0.0134, 0.0206, 0.0056, -0.0013, 0.0190, 0.0124,
-0.0058, -0.0196, 0.0174, 0.0144, -0.0022, -0.0100, -0.0004, -0.0023,
-0.0202, -0.0018, -0.0095, 0.0012, -0.0105, 0.0061, 0.0087, -0.0132,
-0.0198, 0.0028, 0.0237, 0.0028, 0.0136, -0.0047, -0.0168, -0.0124])
In short, the first backward call reports:
input.requires_grad True
while the second reports:
input.requires_grad False
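
The two dumps correspond to the two layers of the network: the first comes from a 64->64 layer whose saved input is the previous layer's output, the second from a 3->64 layer whose saved input is the raw all-ones data. For reference, a minimal driver sketch that would produce two backward calls like these (the shapes and the all-ones input are read off the printout; the initialization and the rest are my assumptions):

import torch

# Assumes LinearFunction is the custom autograd.Function whose backward is
# shown below; its forward saves (input, weight, bias) with ctx.save_for_backward.
x = torch.ones(100, 3)                       # leaf data, requires_grad=False
w1 = torch.empty(64, 3).uniform_(-0.1, 0.1).requires_grad_()
b1 = torch.empty(64).uniform_(-0.1, 0.1).requires_grad_()
w2 = torch.empty(64, 64).uniform_(-0.1, 0.1).requires_grad_()
b2 = torch.empty(64).uniform_(-0.1, 0.1).requires_grad_()

h = LinearFunction.apply(x, w1, b1)   # h gets a grad_fn, so h.requires_grad is True
y = LinearFunction.apply(h, w2, b2)
y.mean().backward()                   # the 64->64 backward runs first, then 3->64
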
The printing happens inside backward (the rest of the function follows the LinearFunction example from the PyTorch autograd docs):
@staticmethod
def backward(ctx, grad_output):
    # This is a pattern that is very convenient - at the top of backward
    # unpack saved_tensors and initialize all gradients w.r.t. inputs to
    # None. Thanks to the fact that additional trailing Nones are
    # ignored, the return statement is simple even when the function has
    # optional inputs.
    print('grad_output', grad_output)
    print('grad_output.shape', grad_output.shape)
    print('type(ctx)', type(ctx))
    input, weight, bias = ctx.saved_tensors
    print('ctx.saved_tensors', ctx.saved_tensors)
    print('input.shape', input.shape)
    print('input.requires_grad', input.requires_grad)
    print('weight.shape', weight.shape)
    print('bias.shape', bias.shape)
    grad_input = grad_weight = grad_bias = None
    print('ctx.needs_input_grad', ctx.needs_input_grad)
    # Each gradient is computed (and printed) only when the matching
    # entry of ctx.needs_input_grad is True:
    if ctx.needs_input_grad[0]:
        grad_input = grad_output.mm(weight)
        print('grad_input', grad_input)
    if ctx.needs_input_grad[1]:
        grad_weight = grad_output.t().mm(input)
        print('grad_weight', grad_weight)
    if bias is not None and ctx.needs_input_grad[2]:
        grad_bias = grad_output.sum(0)
        print('grad_bias', grad_bias)
    return grad_input, grad_weight, grad_bias
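
As a side note, torch.autograd.gradcheck is a quick way to sanity-check a backward like this; a minimal sketch, again assuming the same LinearFunction (gradcheck wants small, double-precision inputs):

import torch
from torch.autograd import gradcheck

# gradcheck compares the analytical gradients from backward against
# numerical finite differences and returns True when they agree.
inputs = (torch.randn(20, 3, dtype=torch.double, requires_grad=True),
          torch.randn(5, 3, dtype=torch.double, requires_grad=True),
          torch.randn(5, dtype=torch.double, requires_grad=True))
print(gradcheck(LinearFunction.apply, inputs, eps=1e-6, atol=1e-4))
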
So, while inside backward, input.requires_grad changes according to the circumstances; a minimal illustration of what I mean is below. How does this mechanism work? I'd like to know, so I can be safe when making custom autograd.Functions.
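
The behaviour is visible in plain tensor code as well:

import torch

x = torch.ones(5, 3)                      # raw data: a leaf with requires_grad=False
w = torch.randn(4, 3, requires_grad=True)
h = x.mm(w.t())                           # h depends on w, so autograd tracks it
print(x.requires_grad, h.requires_grad)   # prints: False True

This matches the dumps above: the 3->64 layer saved the raw data (requires_grad False), while the 64->64 layer saved the first layer's output (requires_grad True).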