Using model parallelism for a Hessian-vector product

I have AlexNet (only the Conv2d and Linear modules are shown):

(0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
(1): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(2): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(3): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(5): Linear(in_features=9216, out_features=4096, bias=True)
(6): Linear(in_features=4096, out_features=4096, bias=True)
(7): Linear(in_features=4096, out_features=102, bias=True)
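
The Conv2d layers sit on cuda:0 and the Linear layers on cuda:1. Roughly like this sketch (the wrapper name and the exact hand-off point are illustrative, not my actual code; the ReLU/MaxPool layers are omitted for brevity):

import torch
import torch.nn as nn

class SplitAlexNet(nn.Module):  # hypothetical name
    def __init__(self):
        super().__init__()
        # conv stack on cuda:0 (activation/pooling layers omitted here)
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
        ).to('cuda:0')
        # linear stack on cuda:1
        self.classifier = nn.Sequential(
            nn.Linear(9216, 4096),
            nn.Linear(4096, 4096),
            nn.Linear(4096, 102),
        ).to('cuda:1')

    def forward(self, x):
        x = self.features(x.to('cuda:0'))
        # cross-device hand-off: flatten on cuda:0, then move to cuda:1
        x = torch.flatten(x, 1).to('cuda:1')
        return self.classifier(x)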

I am trying to compute a Hessian-vector product for this network. I have the following code, which makes sure the vector and the gradient end up on the same GPU, but I get an error. stored_grad is a flat vector containing all the gradients (built roughly as sketched after the class below).

import numpy as np
import torch


class HVPOperator(object):

    def __init__(self, model, loss, device):
        self.device = device
        self.model = model
        self.loss = loss  # loss function
        self.stored_grad = None  # stored gradient (flat vector, kept on CPU)
        self.count_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)

    def structured_vector(self, flat_vector):
        # split a flat vector into per-parameter tensors, each moved to the
        # device its parameter lives on
        structured = []
        start = 0
        for param in self.model.parameters():
            if param.requires_grad:
                size = param.numel()
                vec = torch.reshape(flat_vector[start: start + size], param.shape)
                structured.append(vec.double().to(param.device))
                start += size
        return structured

    def Hvp(self, vec, storedGrad=False):
        # Returns H*vec, where H is the Hessian of the loss w.r.t. the
        # vectorized model parameters

        # convert a numpy array to a torch tensor and cast to double
        if isinstance(vec, np.ndarray):
            vec = torch.from_numpy(vec)
        vec = vec.to(self.device).double()
        # split vec so each slice sits on the same GPU as its parameter
        vec = self.structured_vector(vec)

        # reshape the stored gradient (which still tracks the computation
        # graph) to match the parameter shapes and devices
        grads = self.structured_vector(self.stored_grad)

        self.model.zero_grad()
        # g^T v per parameter; differentiating this w.r.t. the parameters
        # yields the Hessian-vector product
        gvp = [torch.sum(g * v) for g, v in zip(grads, vec)]
        grad_dict = torch.autograd.grad(gvp, self.model.parameters(), create_graph=True)
        # flatten back to a single CPU vector (the blocks live on two
        # devices, so they are moved before concatenation)
        return torch.cat([h.reshape(-1).double().cpu() for h in grad_dict])
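
For completeness, this is roughly how I build stored_grad and call Hvp (a sketch; criterion, inputs, and targets stand in for my actual data, and the per-parameter gradients are flattened to CPU with torch.cat, which is presumably where the CatBackward in the trace below comes from):

# model is the split network above; criterion/inputs/targets are placeholders
op = HVPOperator(model, criterion, device='cuda:0')

loss = criterion(model(inputs), targets)
grads = torch.autograd.grad(loss, model.parameters(), create_graph=True)
# flatten per-parameter grads into one CPU vector; .cpu() keeps the graph alive
op.stored_grad = torch.cat([g.reshape(-1).cpu() for g in grads])

v = torch.randn(op.count_params)
hv = op.Hvp(v)  # flat Hessian-vector product on CPU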

I get the following error, which happens at index 10, i.e. the weights of the first Linear layer. Any idea how to solve this?

grad_dict = torch.autograd.grad(gvp, self.model.parameters(), create_graph=True)
File "/usr/local/lib/python3.5/dist-packages/torch/autograd/__init__.py", line 157, in grad
inputs, allow_unused)
RuntimeError: Function CatBackward returned an invalid gradient at index 10 - expected device cuda:1 but got cuda:0
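
For reference, this is how I checked which parameter index 10 corresponds to (each of the five conv layers contributes a weight and a bias, so they occupy indices 0 to 9):

for i, (name, p) in enumerate(model.named_parameters()):
    print(i, name, p.device)
# indices 0-9: conv weights/biases on cuda:0
# index 10: first Linear weight on cuda:1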