Data is moved to the GPU, but the GPU is not utilized during training

I am using a simple autoencoder consisting only of convolutional layers. During training, the data is being moved to the GPU (I can see GPU memory usage rise), but the training loop does not seem to be using the GPU for computation.

My training dataset contains roughly 800 images. I even tried increasing the batch size from 32 to 64 and then 128, but there was still no improvement.
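For reference, the move to the GPU can also be confirmed from PyTorch itself rather than only from the memory graph; a minimal, self-contained sketch (the dummy batch shape is illustrative):

import torch

assert torch.cuda.is_available()        # the checks below need a CUDA device
x = torch.randn(32, 3, 128, 128)        # dummy batch; the shape is illustrative
print(torch.cuda.memory_allocated())    # bytes allocated before the move
x = x.to("cuda")                        # same kind of call as img.to(device) in the loop below
print(x.device)                         # cuda:0 once the move succeeded
print(torch.cuda.memory_allocated())    # allocation rises after the move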

Here’s the code:

import time

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = ImageFolder(
    root="dataset_for_cvae",
    transform=transforms,  # transform pipeline defined elsewhere (not shown)
)

# Create the splits
generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [0.8, 0.1, 0.1], generator=generator
)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, pin_memory=True)


class ConvAutoencoder(nn.Module):
    def __init__(self):
        super(ConvAutoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            # Bunch of Conv2d layers
        )
        # Decoder
        self.decoder = nn.Sequential(
            # Bunch of Conv2d layers
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

    def fit(
        self, train_loader, num_epochs, model, criterion, optimizer, val_loader=None
    ):
        train_history = []
        val_history = []
        best_val_loss = float("inf")

        for epoch in range(num_epochs):
            # Training phase
            start = time.time()
            print(f"Epoch: {epoch+1}/{num_epochs}")
            model.train()
            epoch_loss = 0
            num_batches = 0

            for data in train_loader:
                img, _ = data
                img = img.to(device)  # Move images to GPU
                optimizer.zero_grad()
                recons = model(img)
                loss = criterion(recons, img)

                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                num_batches += 1

            avg_train_loss = epoch_loss / num_batches
            train_history.append(avg_train_loss)

            print(
                f"Epoch: {epoch+1}/{num_epochs}, "
                f"Train Loss: {avg_train_loss:.6f}"
            )

        return {"train_loss": train_history, "val_loss": val_history}


# Initialize the model, loss function and optimizer
model = ConvAutoencoder()
model = model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model
history = model.fit(
    train_loader,
    num_epochs=10,  # increased epochs
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    val_loader=val_loader,  # added validation loader
)

How did you verify the GPU isn’t used during training?

I’m using Kaggle for training, and it shows how much the GPU is being utilized during training (for example: 30%, 98%, etc.).
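The same utilization figure can also be polled from inside the notebook; a minimal sketch, assuming the pynvml package is installed so that torch.cuda.utilization() works:

import torch

# Percent of time one or more kernels ran on the GPU over the last sampling
# period, i.e. the same metric the Kaggle panel / nvidia-smi reports (needs pynvml).
if torch.cuda.is_available():
    print(torch.cuda.utilization())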

In that case, profile your workload to narrow down which part of your code is the bottleneck.
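For example, wrapping a few training batches in torch.profiler shows whether the time goes to CPU-side data loading or to CUDA kernels; a minimal sketch that reuses the objects from the question (the batch count and sort key are illustrative):

from torch.profiler import profile, ProfilerActivity

# Profile a handful of batches: if most of the time sits in DataLoader / CPU
# ops rather than CUDA kernels, the loop is input-bound, not compute-bound.
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    for i, (img, _) in enumerate(train_loader):
        img = img.to(device)
        optimizer.zero_grad()
        loss = criterion(model(img), img)
        loss.backward()
        optimizer.step()
        if i == 10:  # a few batches are enough for a first look
            break

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))

If the table shows the bulk of the time in CPU-side operators rather than CUDA kernels, the training loop is waiting on the input pipeline rather than on the GPU.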