Profile your code and check whether your workload is, for example, CPU-bound: in the trace you would see gaps (GPU idle time) between the CUDA kernels.
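Something along these lines makes that check easy (a rough sketch using `torch.profiler`; the helper name and the number of steps are arbitrary):

```python
import torch
from torch.profiler import profile, ProfilerActivity

def profile_steps(model, dataloader, steps=10):
    # Record both CPU ops and CUDA kernels for a few training iterations.
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        for i, batch in enumerate(dataloader):
            if i >= steps:
                break
            loss = model(batch.cuda()).sum()
            loss.backward()
    # Open the trace in chrome://tracing; gaps between kernels on the
    # GPU track mean the GPU is starved, e.g. by the DataLoader.
    prof.export_chrome_trace("trace.json")
```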
The main problem ended up being the underlying `Dataset`, but not how you'd expect. The GPU was indeed waiting on data from the `DataLoader`, but increasing the number of workers didn't help. Each loader was invoking `torch.randn()` to generate dummy data, and it turns out this simply cannot generate data fast enough, no matter how many workers you throw at it. The fix was to replace that call with `torch.cuda.FloatTensor(size).normal_()`, which generates the data directly on the GPU.
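A quick micro-benchmark illustrates the gap (a minimal sketch that assumes a CUDA device is present; the `torch.cuda.synchronize()` calls are needed because GPU kernels launch asynchronously):

```python
import time
import torch

size, length = 512, 4096 * 256  # same shape as the training set below

# CPU generation: what torch.randn() inside the workers was doing.
t0 = time.perf_counter()
cpu_data = torch.randn(length, size)
print(f"torch.randn (CPU): {time.perf_counter() - t0:.3f}s")

# GPU generation: the replacement call.
torch.cuda.synchronize()
t0 = time.perf_counter()
gpu_data = torch.cuda.FloatTensor(length, size).normal_()
torch.cuda.synchronize()  # wait for the async kernel to finish
print(f"FloatTensor.normal_ (GPU): {time.perf_counter() - t0:.3f}s")
```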
After that change, GPU usage shot up. I experimented with different `Dataset` lengths, `batch_size`, and `num_workers`; eventually I was able to increase GPU utilization to 50%, and that is the best I could manage. GPU memory (10 GB) was the limiting factor on the `Dataset` length and `batch_size` I could play with.
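That limit is easy to see with some back-of-the-envelope arithmetic: the training set alone is a float32 tensor of shape `(batch_size * 256, feature_size)` living on the GPU:

```python
batch_size, feature_size = 4096, 512
length = batch_size * 256                # 1,048,576 rows
train_bytes = length * feature_size * 4  # float32 = 4 bytes per element
print(train_bytes / 2**30)               # exactly 2.0 GiB, before the model,
                                         # activations, and optimizer state
```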
The profiler shows that the `DataLoader` is still the bottleneck, but I can't think of any other way to improve this further. Can you?
Here is my code:
```python
import os

import torch
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning import LightningModule, Trainer
# from pytorch_lightning.profiler import PyTorchProfiler

GPUS = 1  # adjust to the number of GPUs on your machine


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        # Generate the dummy data directly on the GPU; torch.randn() on the
        # CPU could not keep the GPU fed.
        self.data = torch.cuda.FloatTensor(length, size).normal_()

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.batch_size = 4096
        self.feature_size = 512
        self.layer1 = torch.nn.Linear(self.feature_size, self.feature_size)
        self.layer2 = torch.nn.Linear(self.feature_size, self.feature_size)
        self.layer3 = torch.nn.Linear(self.feature_size, 2)

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        return self.layer3(x)

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("valid_loss", loss)

    def test_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)

    def optimizer_zero_grad(self, epoch, batch_idx, optimizer, optimizer_idx):
        # Set gradients to None instead of zero to improve performance
        optimizer.zero_grad(set_to_none=True)

    def train_dataloader(self) -> DataLoader:
        return DataLoader(RandomDataset(self.feature_size, self.batch_size * 256),
                          batch_size=self.batch_size,
                          pin_memory=False, num_workers=5, persistent_workers=True)

    def val_dataloader(self) -> DataLoader:
        return DataLoader(RandomDataset(self.feature_size, self.batch_size * 6),
                          batch_size=self.batch_size,
                          pin_memory=False, num_workers=2, persistent_workers=True)

    def test_dataloader(self) -> DataLoader:
        return DataLoader(RandomDataset(self.feature_size, self.feature_size * 10),
                          batch_size=self.batch_size,
                          pin_memory=False, num_workers=2, persistent_workers=True)


def run():
    model = BoringModel()
    # profiler = PyTorchProfiler(with_stack=True)
    profiler = None
    trainer = Trainer(
        gpus=GPUS,
        default_root_dir=os.getcwd(),
        num_sanity_val_steps=0,
        max_epochs=10_000,
        enable_model_summary=False,
        detect_anomaly=False,
        auto_select_gpus=True,
        profiler=profiler,
    )
    trainer.fit(model)


if __name__ == "__main__":
    run()
```