Calling backward() on a custom loss raises a device mismatch RuntimeError (cuda:0 vs cpu)

I’m encountering an issue in my neural network implementation using PyTorch. Despite my efforts to keep all tensors on the same device, calling backward() on my custom loss triggers a RuntimeError complaining that tensors are on different devices (cuda:0 and cpu). Here’s the core of my problem (assume X and y are tensors on the same device):

import torch
import torch.nn as nn

class Model(nn.Module):
    def __init__(self, X, y, lambda_):
        super().__init__()
        self.device = X.device
        self.to(self.device)

        # Parameters
        self.lamb = lambda_
        self.X, self.y = X, y

        self.layer1 = nn.Linear(self.p1, self.p2, device = self.device, dtype = torch.float64)
        self.layer2 = nn.Linear(self.p2, 1, device = self.device, dtype = torch.float64)

         
    def fit(self):
        loss_fn = CustomRegressionLoss(lambda_=self.lamb).to(self.device)
        bare_loss_fn = nn.MSELoss(reduction = 'mean')
        min_lr = 0.0001
        optimizer = torch.optim.SGD(params=self.parameters(), lr=0.01, momentum=0.9, dampening=0, nesterov=True)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=4, factor=0.99, min_lr=min_lr)

        y_pred = self(self.X).to(self.device)
        loss = loss_fn(y_pred, self.y, self.layer1)
        bare_loss = bare_loss_fn(y_pred, self.y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step(loss) 

My model’s forward pass (self(self.X)) returns predictions, which I explicitly move to the same device as my inputs.

Given these precautions, I’m puzzled about the source of the “different devices” error. Could there be a hidden detail I’m overlooking?
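
In case it helps, this is the kind of device audit I could run right before loss.backward(). It’s only an illustrative sketch (report_devices is a helper name of my own, and model stands for an instance of Model above):

def report_devices(model):
    # Parameters and buffers registered on the module
    for name, p in model.named_parameters():
        print("param", name, p.device)
    for name, b in model.named_buffers():
        print("buffer", name, b.device)
    # Plain tensor attributes such as self.X / self.y are not moved by Module.to(),
    # so they have to be checked separately
    print("X:", model.X.device, "y:", model.y.device)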

Additionally, here’s the implementation of my custom loss function for context:

class CustomRegressionLoss(nn.Module):
    def __init__(self, lambda_):
        super().__init__()
        self.lamb = lambda_
        self.mse_loss = nn.MSELoss(reduction='sum')

    def forward(self, input, target, w1):
        square_root_lasso_loss = torch.sqrt(self.mse_loss(input, target))
        lasso_regularization = self.lamb * (torch.abs(w1.weight).sum() + torch.abs(w1.bias).sum())
        total_loss = square_root_lasso_loss + lasso_regularization
        return total_loss
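
For reference, the loss class holds no tensors of its own beyond what is passed in, so a standalone check along these lines (the shapes and lambda value are made up purely for illustration) only ever involves one device:

import torch
import torch.nn as nn

x = torch.randn(32, 10, dtype=torch.float64)
layer = nn.Linear(10, 1, dtype=torch.float64)
pred = layer(x)                       # predictions on CPU
target = torch.randn(32, 1, dtype=torch.float64)

loss = CustomRegressionLoss(lambda_=0.1)(pred, target, layer)
loss.backward()                       # no device error when everything lives on one device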

Edit: Here’s the error log:

RuntimeError                              Traceback (most recent call last)
Cell In [2], line 14
     11 features = pd.read_csv(f"dataSets{j}/s{i}/{i}-important_inds.csv", header = None)[1:].values
     13 model = an.AnnLasso(name="test", mode="regression", warm_start=True)
---> 14 model.fit(X_train, y_train, print_epochs=False, graph=False)
     15 pesr_history[i].append(model.results_analysis(X_test, y_test, features)['exact_recovery'])

File /notebooks/AnnLasso.py:50, in AnnLasso.fit(self, X_train, y_train, X_test, y_test, print_epochs, set_name, graph)
     48         else:
     49             model = rm.AnnLassoRegressionModel(X_train, y_train, lambi, X_test, y_test, self.lr, n_hidden_units=self.p2, init_weights=[model.layer1, model.layer2])
---> 50         model.fit(print_epochs, graph)
     51 else:
     52     model = rm.AnnLassoRegressionModel(X_train, y_train, self.lambda_qut, X_test, y_test)

File /notebooks/RegressionModel.py:66, in AnnLassoRegressionModel.fit(self, print_epochs, graph)
     64 bare_loss = bare_loss_fn(y_pred, self.y)
     65 optimizer.zero_grad()
---> 66 loss.backward()
     68 test_loss = bare_loss_fn(predict(self, self.X_test), self.y_test) if hasattr(self, 'X_test') else ''
     70 if epoch % 100 == 0:

File /usr/local/lib/python3.9/dist-packages/torch/_tensor.py:396, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
    387 if has_torch_function_unary(self):
    388     return handle_torch_function(
    389         Tensor.backward,
    390         (self,),
   (...)
    394         create_graph=create_graph,
    395         inputs=inputs)
--> 396 torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)

File /usr/local/lib/python3.9/dist-packages/torch/autograd/__init__.py:173, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    168     retain_graph = create_graph
    170 # The reason we repeat same the comment below is that
    171 # some Python versions print out the first line of a multi-line function
    172 # calls in the traceback and some print out the last line
--> 173 Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    174     tensors, grad_tensors_, retain_graph, create_graph, inputs,
    175     allow_unreachable=True, accumulate_grad=True)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

Your code is not executable, as e.g. self.p1 etc. are undefined. Could you add the missing pieces so that we can copy/paste and reproduce the issue?