Hi,
When I train my model (a simple MLP) using PyTorch and a DataLoader
with large batches (128,000 samples) over the whole dataset, which contains 18,515,415 observations, GPU (NVIDIA A40) memory usage is only about 500 MB, GPU utilization does not exceed 10% (and only for very short intervals), and most of the time is spent on the CPU retrieving the next batch from the DataLoader.
After I discovered the DataLoader arguments pin_memory
and num_workers,
and also started using the non_blocking
argument when transferring data to the GPU, GPU usage became continuous, but still only about 10% of the GPU is used, with very low memory occupation. These changes reduced the training time from 13 hours to 50 minutes for 200 epochs using 16 workers.
I have also measured the time needed to retrieve a batch from the DataLoader: it is around 5 seconds for the first batch, but much shorter thereafter.
Here is an example of my code:
from typing import Union
import numpy as np
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
class MyCustomFullyConnected(nn.Module):
    """Fully connected network (MLP) with a configurable stack of hidden layers.

    The layout is ``input_size -> hidden_sizes[0] -> ... -> hidden_sizes[-1]
    -> output_size``. ``activation`` is applied after every hidden layer,
    including the first one; the output layer is left linear.

    Args:
        name: Label stored on the instance as ``self.name``.
        input_size: Number of input features. Must be given even though the
            default is ``None``; it is passed straight to ``nn.Linear``.
        output_size: Number of outputs. Must be given, same as ``input_size``.
        hidden_sizes: Width of each hidden layer, in order (at least one).
        activation: Callable applied to the output of every hidden layer.
    """

    def __init__(self,
                 name: str = "MyCustomFC",
                 input_size: int = None,
                 output_size: int = None,
                 hidden_sizes: tuple = (100, 100,),
                 activation=F.relu
                 ):
        super().__init__()
        self.name = name
        self.activation = activation
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_sizes = hidden_sizes

        # model architecture
        self.input_layer = nn.Linear(self.input_size, self.hidden_sizes[0])
        # One Linear per consecutive pair of hidden widths.
        self.fc_layers = nn.ModuleList(
            [nn.Linear(in_f, out_f)
             for in_f, out_f in zip(self.hidden_sizes[:-1], self.hidden_sizes[1:])]
        )
        self.output_layer = nn.Linear(self.hidden_sizes[-1], self.output_size)

    def forward(self, data):
        """Run the forward pass and return the (un-activated) output layer result."""
        # The first hidden layer needs its own activation; without it the
        # input layer and the next Linear collapse into a single affine map.
        out = self.activation(self.input_layer(data))
        for fc_ in self.fc_layers:
            out = self.activation(fc_(out))
        return self.output_layer(out)
def train(model, train_loader, val_loader=None, epochs=100, lr=3e-4, device="cpu"):
    """Train ``model`` with Adam and MSE loss and record the per-epoch losses.

    Args:
        model: Module whose parameters are optimized.
        train_loader: Iterable of ``(data, target)`` batches; must expose
            ``.dataset`` with a length, used to average the epoch loss.
        val_loader: Optional loader passed to ``validate`` after each epoch.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate for Adam.
        device: Device every batch is moved to before the forward pass.

    Returns:
        ``(model, train_losses, val_losses)``, where the two lists hold one
        entry per epoch (``val_losses`` stays empty without ``val_loader``).
    """
    train_losses = []
    val_losses = []
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_function = nn.MSELoss()
    # NOTE(review): `tqdm` and `validate` are not imported/defined in the
    # visible part of the file; they are assumed to come from elsewhere.
    pbar = tqdm(range(epochs))
    for epoch in pbar:
        model.train()
        # Starts as a Python float so an empty loader still yields a number.
        total_loss = 0.0
        for data, target in tqdm(train_loader):
            data = data.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)
            optimizer.zero_grad()
            prediction = model(data)
            loss = loss_function(prediction, target)
            loss.backward()
            optimizer.step()
            # Detach before accumulating: summing the attached loss keeps
            # autograd history alive across every batch of the epoch.
            total_loss += loss.detach() * len(data)
        if val_loader is not None:
            val_loss = validate(model, val_loader, device)
            val_losses.append(val_loss)
        # float() handles both the accumulated tensor and the initial 0.0.
        mean_loss = float(total_loss) / len(train_loader.dataset)
        print(f"Train Epoch: {epoch} Avg_Loss: {mean_loss:.5f}")
        train_losses.append(mean_loss)
    return model, train_losses, val_losses
if __name__ == "__main__":
    device = torch.device("cuda:0")
    # NOTE(review): MyDataSet, features and targets are not defined in the
    # visible source; they are assumed to be provided elsewhere.
    torch_dataset = MyDataSet(torch.tensor(features), torch.tensor(targets), device=device)
    # NOTE(review): with automatic batching the DataLoader fetches samples one
    # by one and collates them, so each 128000-sample batch costs 128000
    # dataset lookups on the CPU. Presumably this is the bottleneck described
    # above -- consider indexing the tensors a whole batch at a time.
    data_loader = DataLoader(torch_dataset, batch_size=128000, shuffle=True,
                             pin_memory=True, num_workers=16)
    model = MyCustomFullyConnected(input_size=7,
                                   output_size=4,
                                   hidden_sizes=(64, 64, 8, 64, 64, 64, 8, 64, 64),
                                   activation=F.relu
                                   )
    model.to(device)
    model.float()
    # Pass the loader built above; the original referenced an undefined
    # `train_loader` here.
    model, train_losses, _ = train(model, data_loader, epochs=200, device=device, lr=3e-4)
However, when I use TensorFlow with the same data and exactly the same neural network architecture, it takes only 4 minutes to complete 200 epochs. According to nvidia-smi, this time the GPU memory is fully occupied and GPU utilization stays around 85%. CPU usage with TensorFlow is also much lower than with PyTorch when it uses workers.
Here is the code for Tensorflow keras:
from tensorflow import keras
def build_model(input_size=7, output_size=4, hidden_layers=(64,64,8,64,64,64,8,64,64)):
    """Build the Keras MLP: one Dense + ReLU pair per hidden width, then a linear Dense.

    Args:
        input_size: Number of input features.
        output_size: Number of units in the final Dense layer.
        hidden_layers: Width of each hidden Dense layer, in order.

    Returns:
        The assembled ``keras.Model`` named ``"my_model"`` (not compiled).
    """
    input_ = keras.layers.Input(shape=(input_size,), name="input")
    x = input_
    for layer_id, layer_size in enumerate(hidden_layers):
        # The original used `self.layer`, but this is a free function with no
        # `self`; Dense is presumably what that attribute referred to.
        x = keras.layers.Dense(layer_size, name=f"layer_{layer_id}")(x)
        x = keras.layers.Activation("relu", name=f"activation_{layer_id}")(x)
    output_ = keras.layers.Dense(output_size)(x)
    # Return the model directly instead of storing it on a non-existent `self`.
    return keras.Model(inputs=input_,
                       outputs=output_,
                       name="my_model")
def train(self,
          model,
          train_dataset,
          val_dataset=None,
          **kwargs):
    """Compile ``model`` with Adam/MSE and fit it for 200 epochs.

    Args:
        self: Unused by the body; kept so existing call sites keep working.
        model: Keras model to compile and fit.
        train_dataset: ``(x, y)`` pair used for training.
        val_dataset: Optional ``(x, y)`` pair used for validation.
        **kwargs: Accepted but not used by the body.

    Returns:
        The object returned by ``model.fit``.
    """
    processed_x, processed_y = train_dataset
    # Default to no validation so `fit` does not hit an unbound name when
    # `val_dataset` is omitted.
    validation_data = None
    if val_dataset is not None:
        processed_x_val, processed_y_val = val_dataset
        validation_data = (processed_x_val, processed_y_val)
    optimizer = keras.optimizers.Adam(learning_rate=3e-4)
    model.compile(optimizer=optimizer,
                  loss="mse",
                  metrics=["mae"])
    history_callback = model.fit(x=processed_x,
                                 y=processed_y,
                                 validation_data=validation_data,
                                 epochs=200,
                                 batch_size=128000,
                                 shuffle=True)
    return history_callback
if __name__ == "__main__":
    # NOTE(review): train_dataset and val_dataset are not defined in the
    # visible source; they are assumed to be (x, y) pairs built elsewhere.
    model = build_model()
    # The visible training function is `train`, not `train_model`, and it
    # keeps a leading `self` parameter it never reads, so pass None for it.
    history = train(None, model, train_dataset, val_dataset)
Could anyone please explain how I can speed up the PyTorch training process (50 minutes with 16 workers) to reach the same performance as TensorFlow (4 minutes)?
Thanks in advance for your help