I’m encountering an issue in my neural network implementation using PyTorch. Despite my efforts to ensure all tensors are on the same device, calling the backward method on my custom loss function triggers a RuntimeError complaining that tensors are on different devices (cuda:0 and cpu). Here’s the core of my problem (assuming X and y are tensors on the same device):
class Model(nn.Module):
    def __init__(self, X, y, lambda_):
        super().__init__()
        self.device = X.device
        self.to(self.device)
        # Parameters
        self.lamb = lambda_
        self.X, self.y = X, y
        self.layer1 = nn.Linear(self.p1, self.p2, device=self.device, dtype=torch.float64)
        self.layer2 = nn.Linear(self.p2, 1, device=self.device, dtype=torch.float64)
    def fit(self):
        loss_fn = CustomRegressionLoss(lambda_=self.lamb).to(self.device)
        bare_loss_fn = nn.MSELoss(reduction='mean')
        min_lr = 0.0001
        optimizer = torch.optim.SGD(params=self.parameters(), lr=0.01, momentum=0.9, dampening=0, nesterov=True)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=4, factor=0.99, min_lr=min_lr)
        y_pred = self(self.X).to(self.device)
        loss = loss_fn(y_pred, self.y, self.layer1)
        bare_loss = bare_loss_fn(y_pred, self.y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step(loss)
My model’s forward pass (self(self.X)) returns predictions, which I explicitly move to the same device as my inputs.
Given these precautions, I’m puzzled about the source of the “different devices” error. Could there be a hidden detail I’m overlooking?
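To try to narrow this down, here is a small diagnostic sketch of the kind of check I can run right before loss.backward(); it walks over the model’s registered parameters and buffers and reports anything that is not on the expected device (the function name and the printout format are just illustrative, not part of my actual code):

import torch

def report_devices(model, expected, *tensors):
    # Registered parameters (layer weights and biases)
    for name, p in model.named_parameters():
        if p.device != expected:
            print(f"parameter {name} is on {p.device}, expected {expected}")
    # Registered buffers (e.g. running statistics)
    for name, b in model.named_buffers():
        if b.device != expected:
            print(f"buffer {name} is on {b.device}, expected {expected}")
    # Any extra tensors of interest (e.g. X, y, y_pred)
    for i, t in enumerate(tensors):
        if t.device != expected:
            print(f"extra tensor #{i} is on {t.device}, expected {expected}")

Calling something like report_devices(self, self.device, self.X, self.y, y_pred) inside fit() should point out which tensor is still sitting on the CPU.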
Additionally, here’s the implementation of my custom loss function for context:
class CustomRegressionLoss(nn.Module):
    def __init__(self, lambda_):
        super().__init__()
        self.lamb = lambda_
        self.mse_loss = nn.MSELoss(reduction='sum')

    def forward(self, input, target, w1):
        square_root_lasso_loss = torch.sqrt(self.mse_loss(input, target))
        lasso_regularization = self.lamb * (torch.abs(w1.weight).sum() + torch.abs(w1.bias).sum())
        total_loss = square_root_lasso_loss + lasso_regularization
        return total_loss
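To try to rule the loss itself out, this is roughly how it can be exercised in isolation with everything explicitly placed on the GPU (the shapes, the lambda value, and the requires_grad setup are placeholders, not my real data):

import torch
import torch.nn as nn

device = torch.device("cuda:0")
layer1 = nn.Linear(10, 5, device=device, dtype=torch.float64)

# Stand-ins for the model output and the targets, both on the GPU
pred = torch.randn(32, 1, device=device, dtype=torch.float64, requires_grad=True)
target = torch.randn(32, 1, device=device, dtype=torch.float64)

loss_fn = CustomRegressionLoss(lambda_=0.1).to(device)
loss = loss_fn(pred, target, layer1)
loss.backward()  # if this succeeds, the loss module itself is not mixing devices

If backward() goes through here, the mismatch presumably comes from one of the tensors created in the surrounding training code rather than from the loss.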
Edit: Here’s the error log:
RuntimeError Traceback (most recent call last)
Cell In [2], line 14
11 features = pd.read_csv(f"dataSets{j}/s{i}/{i}-important_inds.csv", header = None)[1:].values
13 model = an.AnnLasso(name="test", mode="regression", warm_start=True)
---> 14 model.fit(X_train, y_train, print_epochs=False, graph=False)
15 pesr_history[i].append(model.results_analysis(X_test, y_test, features)['exact_recovery'])
File /notebooks/AnnLasso.py:50, in AnnLasso.fit(self, X_train, y_train, X_test, y_test, print_epochs, set_name, graph)
48 else:
49 model = rm.AnnLassoRegressionModel(X_train, y_train, lambi, X_test, y_test, self.lr, n_hidden_units=self.p2, init_weights=[model.layer1, model.layer2])
---> 50 model.fit(print_epochs, graph)
51 else:
52 model = rm.AnnLassoRegressionModel(X_train, y_train, self.lambda_qut, X_test, y_test)
File /notebooks/RegressionModel.py:66, in AnnLassoRegressionModel.fit(self, print_epochs, graph)
64 bare_loss = bare_loss_fn(y_pred, self.y)
65 optimizer.zero_grad()
---> 66 loss.backward()
68 test_loss = bare_loss_fn(predict(self, self.X_test), self.y_test) if hasattr(self, 'X_test') else ''
70 if epoch % 100 == 0:
File /usr/local/lib/python3.9/dist-packages/torch/_tensor.py:396, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
387 if has_torch_function_unary(self):
388 return handle_torch_function(
389 Tensor.backward,
390 (self,),
(...)
394 create_graph=create_graph,
395 inputs=inputs)
--> 396 torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File /usr/local/lib/python3.9/dist-packages/torch/autograd/__init__.py:173, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
168 retain_graph = create_graph
170 # The reason we repeat same the comment below is that
171 # some Python versions print out the first line of a multi-line function
172 # calls in the traceback and some print out the last line
--> 173 Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
174 tensors, grad_tensors_, retain_graph, create_graph, inputs,
175 allow_unreachable=True, accumulate_grad=True)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!