I have alexnet:
(0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
(1): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(2): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(3): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(5): Linear(in_features=9216, out_features=4096, bias=True)
(6): Linear(in_features=4096, out_features=4096, bias=True)
(7): Linear(in_features=4096, out_features=102, bias=True)
which I am trying to calculate hessian vector product for. The Conv layers are put on cuda:0
and the linear layers on cuda:1
. I have the following code, which make sure the vector and the gradient are on the same gpu, but I get an error. stored_grad
is a vector of all the gradients.
class HVPOperator(object):
def __init__(self, model, loss, device):
self.device = device
self.model = model
self.loss = loss # loss function
self.stored_grad = None # stored gradient (on CPU)
self.count_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
def structured_vector(self, flat_vector):
structured =[]
start = 0
for param in self.model.parameters():
if param.requires_grad:
size = param.shape.numel()
vec = torch.reshape(flat_vector[start: (start+size)], param.shape)
structured.append(vec.double().to(param.device))
start += size
return structured
def Hvp(self, vec, storedGrad=False):
# Returns H*vec where H is the hessian of the loss w.r.t. the vectorized model parameters
# convert numpy array to torch tensor
if type(vec) is np.ndarray:
vec = torch.from_numpy(vec)
vec = vec.to(self.device)
vec = vec.double() # convert to double
# compute original gradient, tracking computation graph
grad_dict = self.structured_vector(self.stored_grad)
self.zero_grad()
gvp = [torch.sum(g*v) for g,v in zip(grad_dict, vec)]
grad_dict = torch.autograd.grad(gvp, self.model.parameters(), create_graph=True)
hessian_vec_prod = [v.double() for v in grad_dict]
return hessian_vec_prod.data.double()
I get the following error which happens at index 10, meaning the kernel weights for first linear layer. Any idea how to solve this?
grad_dict = torch.autograd.grad(gvp, self.model.parameters(), create_graph=True)
File "/usr/local/lib/python3.5/dist-packages/torch/autograd/__init__.py", line 157, in grad
inputs, allow_unused)
RuntimeError: Function CatBackward returned an invalid gradient at index 10 - expected device cuda:1 but got cuda:0