I am implementing my own grad check function as autograd’s built-in one is slow on large tensor input and outputs. What I do is choose some random points on the input tensor and perturbe them to calculate finite differences, then I compare the results with automatic gradient. Here is my implementation,
def grad_check(model, data, target, loss_fn, eps=1e-3, control_num=3, x_list = None, y_list = None):
num_grads = []
orig_grads = []
output_original = model(data)
loss_original = loss_fn(torch.squeeze(output_original), torch.squeeze(target))
print('Loss original: ', loss_original)
loss_original.backward()
grad_original = torch.squeeze(data.grad)
(shape_x, shape_y) = torch.squeeze(data).shape[0], torch.squeeze(data).shape[1]
if x_list is None and y_list is None:
x_list = np.random.randint(0, shape_x, size=control_num)
y_list = np.random.randint(0, shape_y, size=control_num)
with torch.no_grad():
for i in range(control_num):
data_copy = data.clone()
data_copy[x_list[i],y_list[i]] = data_copy[x_list[i],y_list[i]] + eps
output_ptb = model(data_copy)
loss_ptb = loss_fn(torch.squeeze(output_ptb), torch.squeeze(target))
print('Loss numeric: ', loss_ptb)
grad_num = (loss_ptb-loss_original)/eps
num_grads.append(grad_num)
orig_grads.append(grad_original[x_list[i],y_list[i]])
print('Numerical grad: ', num_grads)
print('Original grad: ', orig_grads)
print('Ratio: ', np.array(num_grads)/np.array(orig_grads))
return x_list, y_list
Then I run this function with the following simple model:
class TestModel(nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
out = (x+1)/2
return out
When I check the gradients of the above simple model (actually a function without learnable parameters just for demonstration), I observe that while using single precision, gradients don’t match but they match if I use double precision:
# Double calculations
loss_function =nn.MSELoss(reduction="sum")
mymodel = TestModel().double()
input_tensor = torch.randn(140, 120, dtype=torch.double, requires_grad=True)
target_tensor = torch.randn(140, 120, dtype=torch.double, requires_grad=True)
x_list, y_list = grad_check(mymodel, input_tensor, target_tensor, loss_function, eps=1e-3, control_num=5)
Results with double precision:
Numerical grad: [tensor(-0.4685, dtype=torch.float64), tensor(0.6474, dtype=torch.float64), tensor(-0.5744, dtype=torch.float64), tensor(-0.3790, dtype=torch.float64)]
Original grad: [tensor(-0.4687, dtype=torch.float64), tensor(0.6472, dtype=torch.float64), tensor(-0.5746, dtype=torch.float64), tensor(-0.3792, dtype=torch.float64)]
Ratio: [0.99946664 1.00038628 0.99956495 0.9993408 ]
Single precision model:
# Single precision calculations
mymodel = TestModel()
input_tensor = torch.randn(140, 120, requires_grad=True)
target_tensor = torch.randn(140, 120, requires_grad=True)
grad_check(mymodel, input_tensor, target_tensor, loss_function, eps=1e-3, control_num=5, x_list=x_list,
y_list=y_list)
Results with single precision:
Numerical grad: [tensor(-1.9531), tensor(0.), tensor(1.9531), tensor(1.9531)]
Original grad: [tensor(-0.4842), tensor(1.2964), tensor(2.9638), tensor(1.1156)]
Ratio: [4.033863 0. 0.6590007 1.7507753]
What is the reason of this difference? Should I not consider the finite difference grad check to verify if my gradients are true or would that difference create a problem when training my model?