I am using a simple autoencoder made up only of convolutional layers. While training, the data is clearly being moved to the GPU (I can see GPU memory usage rise), but the training loop does not seem to actually use the GPU for computation (a quick device check I run is included after the code).
My training dataset has around 800 images. I even tried increasing the batch size from 32 to 64 and then to 128, but there was no improvement.
Here’s the code:
import time

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = ImageFolder(
    root="dataset_for_cvae",
    transform=transforms,  # my torchvision transform pipeline, defined earlier
)
# Create the splits
generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [0.8, 0.1, 0.1], generator=generator
)
# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, pin_memory=True)
class ConvAutoencoder(nn.Module):
    def __init__(self):
        super(ConvAutoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            # Bunch of Conv2d layers
        )
        # Decoder
        self.decoder = nn.Sequential(
            # Bunch of Conv2d layers
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
    def fit(
        self, train_loader, num_epochs, model, criterion, optimizer, val_loader=None
    ):
        train_history = []
        val_history = []
        best_val_loss = float("inf")
        for epoch in range(num_epochs):
            # Training phase
            start = time.time()
            print(f"Epoch: {epoch+1}/{num_epochs}")
            model.train()
            epoch_loss = 0
            num_batches = 0
            for data in train_loader:
                img, _ = data
                img = img.to(device)  # Move images to GPU
                optimizer.zero_grad()
                recons = model(img)
                loss = criterion(recons, img)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
                num_batches += 1
            avg_train_loss = epoch_loss / num_batches
            train_history.append(avg_train_loss)
            print(
                f"Epoch: {epoch+1}/{num_epochs}, "
                f"Train Loss: {avg_train_loss:.6f}"
            )
        return {"train_loss": train_history, "val_loss": val_history}
# Initialize the model, loss function and optimizer
model = ConvAutoencoder()
model = model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Train the model
history = model.fit(
    train_loader,
    num_epochs=10,  # increased epochs
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    val_loader=val_loader,  # added validation loader
)
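For reference, this is roughly how I confirm that the model and batches actually end up on the GPU (a minimal sketch using the `model` and `train_loader` defined above):

# Quick device-placement sanity check (sketch, run after the setup above)
print(torch.cuda.is_available())        # True on my machine
print(next(model.parameters()).device)  # expect: cuda:0
imgs, _ = next(iter(train_loader))
print(imgs.device)                      # cpu here; batches are moved to the GPU inside fit()
print(f"{torch.cuda.memory_allocated() / 1e6:.1f} MB allocated")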