I am a late starter in deep learning. I want to generate pap smear test cervical cancer images to augment a small dataset of 74 images
train_data - Google Drive .
My written code often starts training normally but freezes after several steps. My guts tells me i do not have the right hyper-parameter tuning . What am doing wrong? please i need help!
from __future__ import print_function
# import argparse
# import os
# import random
import torch
import torch.nn as nn
import torch.nn.parallel
# import torch.nn.fuctional as F
# import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
# from torch.autograd import variable
import torchvision.utils as vutils
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
# from torchvision.utils import save_image
from torchvision.utils import make_grid
# import matplotlib.animation as animation
# from IPython.display import HTML
def display_images(image_tensor, num_images=37, size=(3, 64, 64)):
'''
Function for visualizing images: Given a tensor of images,
number of images, and size per image, plots and prints the
images in an uniform grid.
'''
image_tensor = (image_tensor + 1) / 2
image_unflat = image_tensor.detach().cpu()
image_grid = make_grid(image_unflat[:num_images], nrow=8)
plt.imshow(image_grid.permute(1, 2, 0).squeeze())
plt.show()
dataroot = "train_data"
workers = 2
batch_size = 37
image_size = 64
nc = 3
nz = 100
ngf = 64
ndf = 64
ngpu = 0
n_samples = 74
dataset = dset.ImageFolder(root=dataroot,
transform=transforms.Compose([
transforms.Resize(image_size),
transforms.CenterCrop(image_size),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5,
0.5,
0.5)),
]))
# Create the dataloader
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
shuffle=True, num_workers=workers)
# Decide which device we want to run on
device = torch.device("cuda:0" if (torch.cuda.is_available() and ngpu > 0)
else "cpu")
# Plot some training images
real_batch = next(iter(dataloader))
# Display training set Batch image
plt.axis("off")
plt.title("Training Images")
plt.imshow(np.transpose(vutils.make_grid(real_batch[0].to(device)[:37],
padding=2, normalize=True).cpu(), (1, 2, 0)))
plt.show()
# initialize the weight to normal distribution with mean 0 and Std 0.02
def weights_init(m):
if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
if isinstance(m, nn.BatchNorm2d):
torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
torch.nn.init.constant_(m.bias, val=0)
# Generator random noise
def get_noise(n_samples, nz, device='cpu'):
return torch.randn(n_samples, nz, 1, 1, device=device)
# Create the Generator
class Generator(nn.Module):
def __init__(self, ngpu):
super(Generator, self).__init__()
self.ngpu = ngpu
self.main = nn.Sequential(
# input is Z, going into a convolution
nn.ConvTranspose2d(nz, ngf * 8, 4, 1, 0, bias=False),
nn.BatchNorm2d(ngf * 8),
nn.ReLU(True),
# state size. (ngf*8) x 4 x 4
nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
nn.BatchNorm2d(ngf * 4),
nn.ReLU(True),
# state size. (ngf*4) x 8 x 8
nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
nn.BatchNorm2d(ngf * 2),
nn.ReLU(True),
# state size. (ngf*2) x 16 x 16
nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False),
nn.BatchNorm2d(ngf),
nn.ReLU(True),
# state size. (ngf) x 32 x 32
nn.ConvTranspose2d(ngf, nc, 4, 2, 1, bias=False),
nn.Tanh()
# state size. (nc) x 64 x 64
)
def forward(self, input):
output = self.main(input)
return output
# Create the Generator Instance
netG = Generator(ngpu).to(device)
# Handle multi-gpu if desired
if (device.type == 'cuda') and (ngpu > 1):
netG = nn.DataParallel(netG, list(range(ngpu)))
netG.apply(weights_init)
# Print the model
print(netG)
# Create the Critic
class Discriminator(nn.Module):
def __init__(self, ngpu):
super(Discriminator, self).__init__()
self.ngpu = ngpu
self.main = nn.Sequential(
# input is (nc) x 64 x 64
nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
nn.LeakyReLU(0.2, inplace=True),
# state size. (ndf) x 32 x 32
nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
nn.InstanceNorm2d(ndf * 2, affine=True),
nn.LeakyReLU(0.2, inplace=True),
# state size. (ndf*2) x 16 x 16
nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
nn.InstanceNorm2d(ndf * 4, affine=True),
nn.LeakyReLU(0.2, inplace=True),
# state size. (ndf*4) x 8 x 8
nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
nn.InstanceNorm2d(ndf * 8, affine=True),
nn.LeakyReLU(0.2, inplace=True),
# state size. (ndf*8) x 4 x 4
nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
)
def forward(self, input):
output = self.main(input)
return output
# Create the Critic Instance
netCritic = Discriminator(ngpu).to(device)
# Handle multi-gpu if desired
if (device.type == 'cuda') and (ngpu > 1):
netCritic = nn.DataParallel(netCritic, list(range(ngpu)))
netCritic.apply(weights_init)
# Print the model
print(netCritic)
# Define the loss and optimizer for generator and critic
# Setup RMSprop optimizers for both Gen and Critic
lr = 5e-5
criterion = nn.BCEWithLogitsLoss()
optimizerCritic = optim.RMSprop(netCritic.parameters(), lr=lr)
optimizerG = optim.RMSprop(netG.parameters(), lr=lr)
# Setup Adam optimizers for both G and Critic
# lr = 1e-4
# optimizerCritic = optim.Adam(netCritic.parameters(), lr=lr)
# optimizerG = optim.Adam(netG.parameters(), lr=lr)
# Declare Gradient penalty
def gradient_penalty(netCritic, real_image, fake_image, device="cpu"):
batch_size, channel, height, width = real_image.shape
# alpha is selected randomly between 0 and 1
alpha = torch.rand(batch_size, 1, 1, 1).repeat(1, channel, height, width)
# interpolated image=randomly weighted average between a real and fake
# image
# interpolated image ← alpha *real image + (1 − alpha) fake image
interpolated_image = (alpha*real_image) + (1-alpha) * fake_image
# calculate the critic score on the interpolated image
interpolated_score = netCritic(interpolated_image)
# take the gradient of the score wrt to the interpolated image
gradient = torch.autograd.grad(inputs=interpolated_image,
outputs=interpolated_score,
retain_graph=True,
create_graph=True,
grad_outputs=torch.ones_like
(interpolated_score)
)[0]
gradient = gradient.view(gradient.shape[0], -1)
gradient_norm = gradient.norm(2, dim=1)
gradient_penalty = torch.mean((gradient_norm - 1)**2)
return gradient_penalty
# Training the WGAN with Gradient penalty
n_epochs = 2000
cur_step = 0
LAMBDA_GP = 10
display_step = 50
CRITIC_ITERATIONS = 5
nz = 100
for epoch in range(n_epochs):
# Dataloader returns the batches
for real_image, _ in tqdm(dataloader):
cur_batch_size = real_image.shape[0]
real_image = real_image.to(device)
for _ in range(CRITIC_ITERATIONS):
fake_noise = get_noise(cur_batch_size, nz, device=device)
fake = netG(fake_noise)
critic_fake_pred = netCritic(fake).reshape(-1)
critic_real_pred = netCritic(real_image).reshape(-1)
# Calculate gradient penalty on real and fake images
# generated by generator
gp = gradient_penalty(netCritic, real_image, fake, device)
critic_loss = -(torch.mean(critic_real_pred)
- torch.mean(critic_fake_pred)) + LAMBDA_GP * gp
netCritic.zero_grad()
# To make a backward pass and retain the intermediary results
critic_loss.backward(retain_graph=True)
optimizerCritic.step()
# Train Generator: max E[critic(gen_fake)] <-> min -E[critic(gen_fake)]
gen_fake = netCritic(fake).reshape(-1)
gen_loss = -torch.mean(gen_fake)
netG.zero_grad()
gen_loss.backward()
# Update optimizer
optimizerG.step()
# Visualization code
if cur_step % display_step == 0 and cur_step > 0:
print(f"Step{cur_step}: GenLoss: {gen_loss}: CLoss: {critic_loss}")
display_images(fake)
display_images(real_image)
gen_loss = 0
critic_loss = 0
cur_step += 1
# Save generated fake images into a folder
img_list= [ ]
img_list.append(fake)
for i in range(len(fake)):
vutils.save_image(img_list[-1][i],
"GenPy_Image/gpi%d.jpg" % i, normalize=True)
A similar code was used on MNIST dataset with epoch = 50 and display_step = 500. But values did not work on my code. it often stop training after few steps.
I tried changing optimizer to Adams with the recommended learning rate, epoch 2000 and display_step 100 and later 500. It often stop training and freezes after several epochs like 500 steps or above. And without desired image output.
I want it to train without interruption until it generates the desired images. I am presently training on CPU.