Very low GPU usage (slow training) in comparison to TensorFlow

Hi,

When I train my model (a simple MLP) with PyTorch, using a DataLoader with large batches (128,000 samples) over a dataset of 18,515,415 observations, the GPU (an NVIDIA A40) only occupies about 500 MB of memory and its utilization stays below 10%, spiking only for very short intervals; most of the time is spent on the CPU retrieving the next batches from the DataLoader.

After discovering the DataLoader arguments pin_memory and num_workers, and the non_blocking argument for transfers to the GPU, the GPU usage became continuous, but still only about 10% of the GPU is used, with very low memory occupation. These changes reduced the training time from 13 hours to 50 minutes for 200 epochs with 16 workers.
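For reference, this is the pattern I mean (a minimal sketch; dataset, model and device stand in for the real objects):

from torch.utils.data import DataLoader

# pin_memory keeps host batches in page-locked memory so host-to-device
# copies can run asynchronously; num_workers prefetches batches in
# background processes
loader = DataLoader(dataset, batch_size=128000, shuffle=True,
                    pin_memory=True, num_workers=16)

for data, target in loader:
    # non_blocking=True overlaps the copy with compute, provided the
    # source tensor is pinned
    data = data.to(device, non_blocking=True)
    target = target.to(device, non_blocking=True)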

I have also measured the time to retrieve a batch from the DataLoader: around 5 seconds for the first batch, but much faster thereafter.
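In case it is useful, this is roughly how I timed it (a sketch; train_loader as defined in the code below):

import time

it = iter(train_loader)
for i in range(3):
    t0 = time.perf_counter()
    batch = next(it)  # the first call also pays the worker startup cost
    print(f"batch {i}: {time.perf_counter() - t0:.3f} s")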

Here is an example of my code:

from typing import Union
import numpy as np

import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm  # progress bars used in train()

class MyCustomFullyConnected(nn.Module):
    def __init__(self,
                 name: str="MyCustomFC",
                 input_size: int=None,
                 output_size: int=None,
                 hidden_sizes: tuple=(100,100,),
                 activation=F.relu
                ):
        super().__init__()
        self.name = name
        
        self.activation = activation

        self.input_size = input_size
        self.output_size = output_size
        self.hidden_sizes = hidden_sizes

        # model architecture
        self.input_layer = nn.Linear(self.input_size, self.hidden_sizes[0])
        self.fc_layers = nn.ModuleList([
            nn.Linear(in_f, out_f)
            for in_f, out_f in zip(self.hidden_sizes[:-1], self.hidden_sizes[1:])
        ])
        self.output_layer = nn.Linear(hidden_sizes[-1], self.output_size)

    def forward(self, data):
        """The forward pass of the model
        """
        out = self.input_layer(data)
        for fc_ in self.fc_layers:
            out = fc_(out)
            out = self.activation(out)
        out = self.output_layer(out)
        return out

def train(model, train_loader, val_loader=None, epochs=100, lr=3e-4, device="cpu"):
    train_losses = []
    val_losses = []
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_function = nn.MSELoss()
    pbar = tqdm(range(epochs))
    for epoch in pbar:
        model.train()
        total_loss = 0
        pbar_batch = tqdm(train_loader)
        for batch in pbar_batch:
            data, target = batch
            data = data.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)
            optimizer.zero_grad()
            prediction = model(data)
            loss = loss_function(prediction, target)
            loss.backward()
            optimizer.step()
            # detach so the running sum does not keep the autograd graph alive
            total_loss += loss.detach() * len(data)
        if val_loader is not None:
            val_loss = validate(model, val_loader, device)
            val_losses.append(val_loss)
        mean_loss = total_loss.item() / len(train_loader.dataset)
        print(f"Train Epoch: {epoch}   Avg_Loss: {mean_loss:.5f}")
        train_losses.append(mean_loss)
    return model, train_losses, val_losses

if __name__ == "__main__":
    device = torch.device("cuda:0")
    # features and targets are the raw (N, 7) and (N, 4) arrays;
    # TensorDataset stands in for the custom MyDataSet so the snippet runs as-is
    torch_dataset = TensorDataset(torch.tensor(features, dtype=torch.float32),
                                  torch.tensor(targets, dtype=torch.float32))
    train_loader = DataLoader(torch_dataset, batch_size=128000, shuffle=True,
                              pin_memory=True, num_workers=16)

    model = MyCustomFullyConnected(input_size=7,
                                   output_size=4,
                                   hidden_sizes=(64,64,8,64,64,64,8,64,64),
                                   activation=F.relu
                                  )
    model.to(device)
    model.float()
    model, train_losses, _ = train(model, train_loader, epochs=200, device=device, lr=3e-4)

However, when I use TensorFlow with the same data and exactly the same network architecture, 200 epochs take only 4 minutes. Looking at nvidia-smi, this time the GPU memory is fully occupied and GPU utilization stays continuously around 85%. The CPU is also far less loaded than it is with PyTorch and its 16 workers.

Here is the code for TensorFlow/Keras:

from tensorflow import keras

def build_model(input_size=7, output_size=4, hidden_layers=(64,64,8,64,64,64,8,64,64)):
    input_ = keras.layers.Input(shape=(input_size,), name="input")
    x = input_
    for layer_id, layer_size in enumerate(hidden_layers):
        x = keras.layers.Dense(layer_size, name=f"layer_{layer_id}")(x)
        x = keras.layers.Activation("relu", name=f"activation_{layer_id}")(x)
    output_ = keras.layers.Dense(output_size)(x)
    model = keras.Model(inputs=input_,
                        outputs=output_,
                        name="my_model")
    return model

def train_model(model,
                train_dataset,
                val_dataset=None,
                **kwargs):

    processed_x, processed_y = train_dataset

    validation_data = None
    if val_dataset is not None:
        processed_x_val, processed_y_val = val_dataset
        validation_data = (processed_x_val, processed_y_val)

    optimizer = keras.optimizers.Adam(learning_rate=3e-4)

    model.compile(optimizer=optimizer,
                  loss="mse",
                  metrics=["mae"])

    history_callback = model.fit(x=processed_x,
                                 y=processed_y,
                                 validation_data=validation_data,
                                 epochs=200,
                                 batch_size=128000,
                                 shuffle=True)
    return history_callback

if __name__ == "__main__":
    model = build_model()
    # train_dataset and val_dataset are (features, targets) tuples of arrays
    history = train_model(model, train_dataset, val_dataset)

Could anyone please suggest how I can speed up the PyTorch training (50 minutes with 16 workers) to match the TensorFlow performance (4 minutes)?

Thanks in advance for your help

Your use case is most likely CPU-limited, and you can profile it to verify this claim.
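For example, something along these lines with torch.profiler (reusing the names from your training loop) would show whether the time is spent on the CPU side (data loading, kernel launches) rather than on the GPU:

from torch.profiler import profile, ProfilerActivity

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    for step, (data, target) in enumerate(train_loader):
        data = data.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)
        optimizer.zero_grad()
        loss = loss_function(model(data), target)
        loss.backward()
        optimizer.step()
        if step >= 10:  # a handful of steps is enough for a first look
            break

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))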
In this case you could use CUDA Graphs (assuming your input shapes are static) as described here, which records the kernel launches once and then only replays the graph, reducing the CPU workload.
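A minimal sketch of a whole-iteration capture, following the pattern from the docs (static_input and static_target are preallocated buffers matching your fixed batch shape, and the DataLoader would need drop_last=True so every batch has that shape):

static_input = torch.zeros(128000, 7, device=device)
static_target = torch.zeros(128000, 4, device=device)

# warm up on a side stream before capture, as required for CUDA Graphs
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        optimizer.zero_grad(set_to_none=True)
        loss = loss_function(model(static_input), static_target)
        loss.backward()
        optimizer.step()
torch.cuda.current_stream().wait_stream(s)

# capture one full training step into a graph
g = torch.cuda.CUDAGraph()
optimizer.zero_grad(set_to_none=True)
with torch.cuda.graph(g):
    static_loss = loss_function(model(static_input), static_target)
    static_loss.backward()
    optimizer.step()

# training loop: copy each batch into the static buffers and replay
for data, target in train_loader:
    static_input.copy_(data.to(device, non_blocking=True))
    static_target.copy_(target.to(device, non_blocking=True))
    g.replay()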