Some timing results:

| | time | error untrained | error trained |
|---|---|---|---|
| manual differentiation | 0:03:16 | 353729.12 | 10.401332 |
| autodiff with .grad and .sum() | 0:04:00 | 353527.53 | 16.774494 |
| autodiff with .grad and grad_outputs ones | 0:03:53 | 353527.53 | 16.774494 |
| autodiff with .grad and .sum() w.r.t. both inputs | 0:03:51 | 353527.53 | 16.774494 |
| autodiff with .backward() | 0:05:32 | 353729.12 | 3889350.8 |
Here manual differentiation is to be considered ground truth. Autodiff does add some overhead. It seems like using `grad` is the winner w.r.t. speed, but numerically it (marginally) differs from ground truth.
When using `.backward()` as I initially proposed, the training loss goes down and the forward propagation before training is numerically identical to ground truth. I don't know what happened after that, but I think it's accumulating gradients somewhere in a bad way.
Code
import torch
from datetime import datetime
# print(torch.__version__)
#> '0.4.0a0+059299b'
from torch.autograd import Variable
from torch import nn
from torch.nn.parameter import Parameter
import torch.optim as optim
def dfdx(x, param, f):
    """Gradient of f(x, param) w.r.t. x, via a scalar .sum() reduction.

    The result keeps its own graph (create_graph=True) so the training
    loss can later be backpropagated through this derivative.
    """
    total = f(x, param).sum()
    # Summing first lets autograd use an implicit scalar seed (grad_outputs=None).
    (grad_x,) = torch.autograd.grad(
        outputs=[total],
        inputs=[x],
        grad_outputs=None,
        retain_graph=True,
        create_graph=True,
        only_inputs=True,
        allow_unused=True,
    )
    return grad_x
def dfdx2(x, param, f):
    """Gradient of f(x, param) w.r.t. x, seeded with an explicit ones tensor.

    Equivalent to summing the output first, but passes grad_outputs
    explicitly instead of reducing with .sum().
    """
    out = f(x, param)
    (grad_x,) = torch.autograd.grad(
        outputs=[out],
        inputs=[x],
        grad_outputs=torch.ones_like(out),
        retain_graph=True,
        create_graph=True,
        only_inputs=True,
        allow_unused=True,
    )
    return grad_x
def dfdx3(x, param, f):
    """Gradient of f(x, param).sum() w.r.t. both x and param; returns only d/dx.

    The gradient w.r.t. param is computed but discarded — this variant
    exists to measure the cost of differentiating w.r.t. both inputs.
    """
    total = f(x, param).sum()
    grad_x, _ = torch.autograd.grad(
        outputs=[total],
        inputs=[x, param],
        grad_outputs=None,
        retain_graph=True,
        create_graph=True,
        only_inputs=True,
        allow_unused=True,
    )
    return grad_x
def dfdx4(x, param, f):
    """Gradient of f(x, param) w.r.t. x on a detached leaf copy of x.

    BUG FIX: the original version called z.backward(...), which accumulates
    gradients into the .grad of EVERY leaf tensor reachable from z —
    including the model's parameters. Those stale gradients then added onto
    the ones produced by loss.backward() during training, corrupting the
    optimizer updates (the reported trained error of 3889350.8).
    torch.autograd.grad computes the derivative w.r.t. x only and does not
    touch any parameter's .grad field.
    """
    # Detached leaf copy, same as the old Variable(x.data, requires_grad=True)
    # but without the deprecated Variable wrapper.
    x = x.detach().requires_grad_(True)
    z = f(x, param)
    dfdx_val, = torch.autograd.grad(
        outputs=[z],
        inputs=[x],
        grad_outputs=torch.ones_like(z),
        create_graph=True,
    )
    return dfdx_val
class Model(torch.nn.Module):
def __init__(self,manual_diff):
super(Model, self).__init__()
self.linear1 = nn.Linear(1, 1)
self.linear2 = nn.Linear(1, 1)
self.unneccessary_op1 = nn.Linear(1, 100)
self.unneccessary_op2= nn.Linear(100, 1)
self.unneccessary_op3= nn.Linear(1, 1)
# this functional should be 2(x+0.01*y**2) if diff works.
self.functional = lambda x,y : 100+(x+0.01*y**2)**2
self.functionaldfdx = lambda x,y : 2*(x+0.01*y**2)
self.manual_diff = manual_diff
def forward(self, x):
# Add stupid ops to make sure the .sum() in dfdx is not only real work.
x = self.unneccessary_op1(x)
x = self.unneccessary_op2(x)
x = self.unneccessary_op3(x)
x = self.linear1(x)
z = self.linear2(x) # Should be learned to be constant 0
if self.manual_diff:
x = self.functionaldfdx(x,z)
else:
x = dfdx(x,z,self.functional)
return x
# Shared MSE loss used by both the training closure and evaluation.
criterion = nn.MSELoss()
def experiment():
    """Run 100001 Adam steps on the global `model` with the global
    `optimizer`, drawing a fresh random batch per step, and print the
    elapsed wall-clock time.
    """
    start_time = datetime.now()
    batch_size = 3000
    for _ in range(100001):
        def closure():
            # Fresh random batch each step; the target is the input itself.
            inputs = Variable(torch.randn(batch_size, 1))
            outputs = model(inputs)
            loss = criterion(outputs, inputs)
            optimizer.zero_grad()
            loss.backward()
            return loss
        optimizer.step(closure)
    elapsed = datetime.now() - start_time
    print('Time elapsed {}'.format(elapsed))
def evaluate():
    """Return the MSE of the global `model` against the identity target
    on the deterministic input 1, 2, ..., 1000 (as a numpy scalar)."""
    xs = Variable(torch.ones(1000, 1).cumsum(0))
    prediction = model(xs)
    # The solution is the identity mapping: y == x.
    return criterion(prediction, xs).data.numpy()
# Data-driven benchmark: (label, seed, manual_diff flag, dfdx variant to
# install before the run; None keeps the current binding). Run order and
# seeds match the original copy-pasted stanzas exactly.
_runs = [
    ('Using manual differentiation', 1, True, None),
    ('Using autodiff with .sum()', 1, False, None),
    ('Using autodiff with grad_outputs ones', 1, False, dfdx2),
    ('Using autodiff with .sum() w.r.t both inputs', 1, False, dfdx3),
    ('Using autodiff with .backward()', 2, False, dfdx4),
]
for _label, _seed, _manual, _grad_fn in _runs:
    if _grad_fn is not None:
        # Model.forward looks up `dfdx` at call time, so rebinding the
        # module-level name switches the differentiation strategy.
        dfdx = _grad_fn
    torch.manual_seed(_seed)
    model = Model(manual_diff=_manual)
    print(_label)
    print('error before: ', evaluate())
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    experiment()
    print('error after: ', evaluate())
Also, big thanks to Yunjey for sharing this gist