GPU Utilization Not Improving Training Time in PyTorch Regression Model

I’ve implemented a simple linear regression model using PyTorch and attempted to accelerate training with a GPU. Despite this, I haven’t observed any improvement in training time. Profiling the model reveals a significant CPU bottleneck rather than the GPU-bound workload I expected. Here’s a breakdown of the profiler’s output:

Top 10 Functions by CPU Time:
model_fit: CPU time = 6268886.00 us
Optimizer.step#FISTA.step: CPU time = 402481.00 us
cudaLaunchKernel: CPU time = 302777.00 us
aten::to: CPU time = 109666.00 us
aten::_to_copy: CPU time = 107876.00 us
aten::mul: CPU time = 85097.00 us
aten::copy_: CPU time = 82672.00 us
aten::abs: CPU time = 78300.00 us
aten::quantile: CPU time = 75502.00 us
aten::empty_strided: CPU time = 70781.00 us

Top 10 Functions by CUDA Time:
model_fit: CUDA time = 41545.00 us
Optimizer.step#FISTA.step: CUDA time = 23568.00 us
aten::mul: CUDA time = 8151.00 us
aten::sum: CUDA time = 5403.00 us
aten::sub: CUDA time = 4679.00 us
void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, at::detail::Array<char*, 3> >(int, at::native::CUDAFunctor_add, at::detail::Array<char*, 3>): CUDA time = 4208.00 us
aten::copy_: CUDA time = 4170.00 us
aten::mse_loss: CUDA time = 3822.00 us
aten::abs: CUDA time = 3601.00 us
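
For reference, the profile above was collected roughly like this (a sketch; the model_fit label comes from a record_function block around the fit call, and the lambda value is illustrative):

import torch
from torch.profiler import profile, record_function, ProfilerActivity

model = LinearModel(lambda_=1.0)  # illustrative hyperparameter

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("model_fit"):
        model.fit(X, y)

# Top 10 ops by CPU time and by CUDA time
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))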

It appears that a considerable amount of computation still happens on the CPU, especially multiplicative operations and data transfers (aten::to, aten::_to_copy). Here’s the relevant part of my model code:

import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler


class LinearRegressionLoss(nn.Module):
    def __init__(self, lambda_, nu=None):  # nu is currently unused below
        super().__init__()
        self.lamb = lambda_
        self.nu = nu
        self.mse_loss = nn.MSELoss(reduction='sum')

    def forward(self, input, target, theta):
        square_root_lasso_loss = torch.sqrt(self.mse_loss(input, target))
        regularization = self.lamb * (torch.sum(torch.abs(theta.weight)) + torch.abs(theta.bias))
        total_loss = square_root_lasso_loss + regularization
        return total_loss, square_root_lasso_loss

class LinearModel:
    def __init__(self, lambda_, path_type=0):
        self.path_type = path_type
        self.lambda_ = lambda_
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def fit(self, X, y, verbose=False):
        # Data Processing: StandardScaler works on NumPy arrays (CPU),
        # so scale first, then move the data to the target device once.
        scaler = StandardScaler()
        X = torch.tensor(scaler.fit_transform(X.values), dtype=torch.float, device=self.device)
        y = torch.tensor(y.values.squeeze(), dtype=torch.float, device=self.device)
        
        # Model Parameters
        self.theta = nn.Linear(X.shape[1], 1, dtype=torch.float, device=self.device)

        # Training: ramp lambda up to its final value (warm start)
        for i in range(-1, 6):
            init_lr = 0.1
            lambi = self.lambda_ * (np.exp(i) / (1 + np.exp(i)) if i < 5 else 1)
            rel_err = 1e-11 if i == 5 else 1e-5

            if verbose:
                print(f"Lambda = {lambi:.4f}")
            self.train_model(X, y, lambi, init_lr, rel_err, verbose)
        if verbose: print("MODEL FITTED!")

    def forward(self, X):
        if X.device != self.device:
            X = X.to(self.device)

        return self.theta(X).squeeze()


    def train_model(self, X, y, lambda_, init_lr, rel_err, verbose):
        loss_fn = LinearRegressionLoss(lambda_).to(self.device)
        train_score_fn = nn.MSELoss(reduction='mean').to(self.device)
        
        epoch, last_loss = 0, np.inf
        optimizer = FISTA(params=self.theta.parameters(), lr=init_lr, lambda_=lambda_)

        lr_factor = 0.9
        max_epochs = 10000

        while epoch < max_epochs:
            optimizer.zero_grad()

            y_pred = self.forward(X)
            loss, bare_loss = loss_fn(y_pred, y, self.theta)
            loss = loss.detach()
            train_loss = train_score_fn(y_pred, y).detach()

            if loss > last_loss: 
                learning_rate = optimizer.param_groups[0]['lr']
                optimizer = FISTA(params=self.theta.parameters(), lr=learning_rate*lr_factor, lambda_=lambda_)

            if epoch % 20 == 0:
                if verbose:
                    print(f"\tEpoch: {epoch} | Loss: {loss.item():.5f}")

                if epoch > 0 and abs(loss - last_loss) / loss < rel_err:
                    if verbose: print(f"\n\t Descent stopped: loss is no longer decreasing significantly.\n")
                    break
                
                last_loss = loss

            epoch += 1
            bare_loss.backward()
            optimizer.step()

            if epoch == max_epochs and verbose: print("FISTA descent stopped: maximum iterations reached") 

And the optimizer:

class FISTA(torch.optim.Optimizer):
    def __init__(self, params, lr, lambda_):
        self.lr = lr
        self.lambda_ = lambda_
        defaults = dict(lr=lr)
        super(FISTA, self).__init__(params, defaults)
        

    def shrinkage_operator(self, u, lambda_):
        '''Applies the shrinkage (soft-thresholding) operator to a PyTorch tensor.'''
        return u.sign() * torch.clamp(u.abs() - lambda_, min=0.0)
        
    @torch.no_grad()
    def step(self, closure=None):
        '''Performs a single optimization step.'''
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad
                state = self.state[p]

                if 'x_prev' not in state:
                    # We use .detach() to ensure we do not track history
                    state['x_prev'] = p.detach().clone()
                    state['y_prev'] = p.detach().clone()
                    state['t_prev'] = torch.tensor(1., device=p.device)

                x_prev, y_prev, t_prev = state['x_prev'], state['y_prev'], state['t_prev']

                x_next = self.shrinkage_operator(y_prev - self.lr * grad, self.lr * self.lambda_)
                t_next = (1. + torch.sqrt(1. + 4. * t_prev ** 2)) / 2.
                y_next = x_next + ((t_prev - 1) / t_next) * (x_next - x_prev)

                state['x_prev'].copy_(x_next)
                state['y_prev'].copy_(y_next)
                state['t_prev'].copy_(t_next)

                p.copy_(x_next)

        return loss
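
For completeness, here is a minimal standalone snippet that exercises the whole pipeline on synthetic data and checks where the parameters end up (sizes and the lambda value are illustrative):

import numpy as np
import pandas as pd
import torch

# Synthetic regression data (arbitrary sizes)
X = pd.DataFrame(np.random.randn(1000, 20))
y = pd.Series(X.values @ np.random.randn(20) + 0.1 * np.random.randn(1000))

model = LinearModel(lambda_=1.0)
model.fit(X, y, verbose=False)
print(next(model.theta.parameters()).device)   # cuda:0 when CUDA is available
print(model.device)

My questions: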
  1. Why is the GPU underutilized, and why are operations like aten::mul and aten::abs still performed on the CPU?
  2. How can I ensure that more computation, especially tensor operations, is offloaded to the GPU to improve training efficiency?

Any insights or suggestions on how to better leverage GPU capabilities in this PyTorch model would be greatly appreciated!


Edit: I added a device argument to the model class so that, if given, it forces the model’s device.
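Roughly, inside LinearModel.__init__ (with a device=None parameter added to the signature):

        if device is not None:
            self.device = torch.device(device)   # force the requested device
        else:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Results: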

t = time.time()
model = LinearModel(path_type=0)
model.fit(X, y, False)
print(time.time()-t)

--> 0.7786831855773926

t = time.time()
model = LinearModel(path_type=0, device='cpu')
model.fit(X, y, False)
print(time.time()-t)

--> 0.800177001953125