I’ve implemented a simple linear regression model in PyTorch and attempted to accelerate training with a GPU. Despite this, I haven’t observed any improvement in training time: profiling shows a significant CPU bottleneck rather than the GPU speedup I expected. Here’s a breakdown of the profiler’s output:
Top 10 Functions by CPU Time:
model_fit: CPU time = 6268886.00 us
Optimizer.step#FISTA.step: CPU time = 402481.00 us
cudaLaunchKernel: CPU time = 302777.00 us
aten::to: CPU time = 109666.00 us
aten::_to_copy: CPU time = 107876.00 us
aten::mul: CPU time = 85097.00 us
aten::copy_: CPU time = 82672.00 us
aten::abs: CPU time = 78300.00 us
aten::quantile: CPU time = 75502.00 us
aten::empty_strided: CPU time = 70781.00 us
Top 10 Functions by CUDA Time:
model_fit: CUDA time = 41545.00 us
Optimizer.step#FISTA.step: CUDA time = 23568.00 us
aten::mul: CUDA time = 8151.00 us
aten::sum: CUDA time = 5403.00 us
aten::sub: CUDA time = 4679.00 us
void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array<char*, 3> >(int, at::native::CUDAFunctor_add, at::detail::Array<char*, 3>): CUDA time = 4208.00 us
aten::copy_: CUDA time = 4170.00 us
aten::mse_loss: CUDA time = 3822.00 us
aten::abs: CUDA time = 3601.00 us
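For reference, the tables above come from torch.profiler; what follows is a trimmed-down sketch of an equivalent setup (not my exact script; model, X and y are as in the code further down):

import torch
from torch.profiler import profile, record_function, ProfilerActivity

# Simplified sketch of the profiling harness: record both CPU and CUDA activity
# around a single fit() call, then print the aggregated operator statistics
# sorted by total CPU time and by total CUDA time.
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("model_fit"):
        model.fit(X, y, verbose=False)

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))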
It appears that a considerable amount of computation still happens on the CPU, especially multiplicative operations and data transfers (aten::to, aten::_to_copy). Here’s the relevant part of my model code:
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler

class LinearRegressionLoss(nn.Module):
    def __init__(self, lambda_, nu):
        super().__init__()
        self.lamb = lambda_
        self.nu = nu
        self.mse_loss = nn.MSELoss(reduction='sum')

    def forward(self, input, target, theta):
        square_root_lasso_loss = torch.sqrt(self.mse_loss(input, target))
        regularization = self.lamb * (torch.sum(torch.abs(theta.weight)) + torch.abs(theta.bias))
        total_loss = square_root_lasso_loss + regularization
        return total_loss, square_root_lasso_loss
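# Note: the loss above is the square-root lasso objective,
# sqrt(sum of squared residuals) + lambda * (L1 norm of the weights + |bias|);
# nu is stored but not used in this excerpt.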
class LinearModel():
    def __init__(self, lambda_):
        self.path_type = path_type
        self.lambda_ = lambda_
        self.device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

    def fit(self, X, y, verbose=False):
        # Data Processing
        X = torch.tensor(X.values, dtype=torch.float, device=self.device)
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        y = torch.tensor(y.values.squeeze(), dtype=torch.float, device=self.device)
        # Model Parameters
        self.theta = nn.Linear(X.shape[1], 1, dtype=torch.float, device=self.device)
        # Training
        for i in range(-1, 6):
            init_lr = 0.1
            # sigmoid schedule: ramp lambda up to its full value on the last subproblem
            lambi = self.lambda_ * (np.exp(i) / (1 + np.exp(i)) if i < 5 else 1)
            rel_err = 1e-11 if i == 5 else 1e-5
            if verbose:
                print(f"Lambda = {lambi.item():.4f}")
            self.train_model(X, y, lambi, init_lr, rel_err, verbose)
        if verbose: print("MODEL FITTED !")

    def forward(self, X):
        if X.device != self.device:
            X = X.to(self.device)
        return self.theta(X).squeeze()

    def train_model(self, X, y, lambda_, init_lr, rel_err, verbose):
        loss_fn = LinearRegressionLoss(lambda_, nu).to(self.device)
        train_score_fn = nn.MSELoss(reduction='mean').to(self.device)
        epoch, last_loss = 0, np.inf
        optimizer = FISTA(params=self.theta.parameters(), lr=init_lr, lambda_=lambda_)
        lr_factor = 0.9
        max_epochs = 10000
        while epoch < max_epochs:
            optimizer.zero_grad()
            y_pred = self.forward(X)
            loss, bare_loss = loss_fn(y_pred, y, self.theta)
            loss = loss.detach()
            train_loss = train_score_fn(y_pred, y).detach()
            if loss > last_loss:
                # if the loss went up, restart FISTA with a smaller step size
                learning_rate = optimizer.param_groups[0]['lr']
                optimizer = FISTA(params=self.theta.parameters(), lr=learning_rate * lr_factor, lambda_=lambda_)
            if epoch % 20 == 0:
                if verbose:
                    print(f"\tEpoch: {epoch} | Loss: {loss.item():.5f}")
            # stop when the relative change in the loss falls below rel_err
            if epoch > 0 and abs(loss - last_loss) / loss < rel_err:
                if verbose: print(f"\n\t Descent stopped: loss is no longer decreasing significantly.\n")
                break
            last_loss = loss
            epoch += 1
            bare_loss.backward()
            optimizer.step()
        if epoch == max_epochs and verbose: print("FISTA descent stopped: maximum iterations reached")
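One part I suspect is the data processing in fit(): the StandardScaler call operates on the tensor after it has already been created on self.device. A variant I have considered (but not benchmarked) is to do the scaling on the NumPy side first and move the scaled data to the GPU in a single transfer; a sketch of what would replace the data-processing block in fit():

# Sketch only (not the code that produced the profile above): scale with NumPy
# on the CPU first, then move the scaled data to the GPU once.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.values)          # NumPy in, NumPy out
X = torch.as_tensor(X_scaled, dtype=torch.float, device=self.device)
y = torch.as_tensor(y.values.squeeze(), dtype=torch.float, device=self.device)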
And the optimizer:
class FISTA(torch.optim.Optimizer):
    def __init__(self, params, lr, lambda_):
        self.lr = lr
        self.lambda_ = lambda_
        defaults = dict(lr=lr)
        super(FISTA, self).__init__(params, defaults)

    def shrinkage_operator(self, u, lambda_):
        '''Applies the shrinkage operator to a PyTorch tensor.'''
        return u.sign() * torch.clamp(u.abs() - lambda_, min=0.0)

    @torch.no_grad()
    def step(self, closure=None):
        '''Performs a single optimization step.'''
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                state = self.state[p]
                if 'x_prev' not in state:
                    # We use .detach() to ensure we do not track history
                    state['x_prev'] = p.detach().clone()
                    state['y_prev'] = p.detach().clone()
                    state['t_prev'] = torch.tensor(1., device=p.device)
                x_prev, y_prev, t_prev = state['x_prev'], state['y_prev'], state['t_prev']
                x_next = self.shrinkage_operator(y_prev - self.lr * grad, self.lr * self.lambda_)
                t_next = (1. + torch.sqrt(1. + 4. * t_prev ** 2)) / 2.
                y_next = x_next + ((t_prev - 1) / t_next) * (x_next - x_prev)
                state['x_prev'].copy_(x_next)
                state['y_prev'].copy_(y_next)
                state['t_prev'].copy_(t_next)
                p.copy_(x_next)
        return loss
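For what it’s worth, here is a toy check (not part of the training script) of the soft-thresholding step on a CUDA tensor; the arithmetic should stay on the GPU since the input lives there:

import torch

# Toy check only: soft-threshold a small CUDA tensor (requires a CUDA device).
u = torch.tensor([-2.0, -0.5, 0.0, 0.5, 2.0], device="cuda")
lam = 1.0
print(u.sign() * torch.clamp(u.abs() - lam, min=0.0))
# expected: tensor([-1., 0., 0., 0., 1.], device='cuda:0')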
- Why is the GPU underutilized, and why are operations like aten::mul and aten::abs still performed on the CPU?
- How can I ensure that more computation, especially tensor operations, is offloaded to the GPU to improve training efficiency?
Any insights or suggestions on how to better leverage GPU capabilities in this PyTorch model would be greatly appreciated!
Edit: I added a ‘device’ argument to the model class so that, if given, it overrides the automatic device selection. Results:
t = time.time()
model = LinearModel(path_type=0)
model.fit(X, y, False)
print(time.time() - t)
--> 0.7786831855773926

t = time.time()
model = LinearModel(path_type=0, device='cpu')
model.fit(X, y, False)
print(time.time() - t)
--> 0.800177001953125
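For completeness, I plan to re-time with explicit synchronization, in case asynchronous kernel launches are skewing the naive wall-clock numbers above; a sketch (same X and y as before, no results yet):

import time
import torch

# Synchronize before and after the timed region so all queued GPU work has
# finished before each timestamp is taken.
if torch.cuda.is_available():
    torch.cuda.synchronize()
t = time.time()
model = LinearModel(path_type=0)
model.fit(X, y, False)
if torch.cuda.is_available():
    torch.cuda.synchronize()
print(time.time() - t)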