Loading data is taking a lot of time

I have tried to make my training faster for the whole weekend, but it still needs about 3 hours for one epoch, which I don't think is normal since my model is not that deep. My dataset has around 200k images, all in one folder without labels. They have different sizes, the smallest being 128x128, but I crop them in the middle and resize them to 64x64. The backbone is similar to the DCGAN PyTorch tutorial, which is where I started before making some modifications to get it running on my GPU. However, this did not speed up my training at all, and my GPU is barely involved (0-1% utilization; it is a GeForce GTX 1080). I have been reading similar topics the whole weekend and tried to adapt my code accordingly: writing my own custom dataset, using .cuda(non_blocking=True), etc., but nothing helped. An often-asked question is whether an HDD or SSD is being used; I am using an HDD.

Timing the dataloader iterations over batches, the average is around 180 seconds per batch, sometimes jumping to ~325 seconds.
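
For reference, this is roughly the pattern I used for the measurement (dataloader is the DataLoader from the full code below; the print is just for illustration):

    import time

    data_start = time.time()
    for i, data in enumerate(dataloader):
        # time spent waiting on the dataloader workers for this batch
        print("batch %d loaded in %.1f s" % (i, time.time() - data_start))
        # ... training step ...
        data_start = time.time()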

My full code:

import glob
import os
import random
import time

import matplotlib
import matplotlib.animation as animation
import matplotlib.pyplot as plt
import numpy as np
import torch as th
import torch.backends.cudnn as cudnn
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
from PIL import Image, ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True

matplotlib.rcParams['animation.embed_limit'] = 2**128

start_time = time.time()

# seed for reproducibility
seed = 12
random.seed(seed)
th.manual_seed(seed)
# number of GPUs
ngpu = 1

# size of the noise vector the generator starts from
nz = 100
# number of channels in the image: 1 -> grayscale, 3 -> RGB
nc = 3
# learning rate
lr = 0.001
# number of epochs
nr_epochs = 1

# batch size
batch_size = 64
# size of the image to work with
image_size = 64

# number of workers for the data loader
workers = 3
# dataset = "./dataset/Images/"
dataset = "./dataset/"

Tensor = th.cuda.FloatTensor

class JTA200(th.utils.data.Dataset):
    def __init__(self, image_paths, transform=None):
        self.root = image_paths
        self.transform = transform
        self.image_paths = []
        for path in os.listdir(self.root):
            full_path = os.path.join(self.root, path)
            if os.path.isfile(full_path):
                self.image_paths.append(full_path)

    def __getitem__(self, index):
        image_path = self.image_paths[index]
        x = Image.open(image_path)
        x = x.convert('RGB')
        if self.transform is not None:
            x = self.transform(x)
        return x

    def __len__(self):
        return len(self.image_paths)

def to_img(x):
    out = 0.5 * (x + 1)
    out = out.clamp(0, 1)
    out = out.view(-1, nc, image_size, image_size)
    return out

def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        th.nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        th.nn.init.normal_(m.weight.data, 1.0, 0.02)
        th.nn.init.constant_(m.bias.data, 0)

class Flatten(th.nn.Module):
    def forward(self, x):
        return x.view(x.shape[0], -1)

class Descriminator(th.nn.Module):
    def __init__(self, kernel_size, p):
        super(Descriminator, self).__init__()
        self.conv = th.nn.Sequential(
            th.nn.Dropout2d(p),
            th.nn.Conv2d(nc, 64, kernel_size, padding=1, stride=2),   # 32x32x64
            th.nn.LeakyReLU(0.2, True),
            th.nn.Dropout2d(p),
            th.nn.Conv2d(64, 64, kernel_size, padding=1, stride=2),   # 16x16x64
            th.nn.BatchNorm2d(64),
            th.nn.LeakyReLU(0.2, True),
            th.nn.Dropout2d(p),
            th.nn.Conv2d(64, 32, kernel_size, padding=1, stride=2),   # 8x8x32
            th.nn.BatchNorm2d(32),
            th.nn.LeakyReLU(0.2, True),
            Flatten(),
            th.nn.Dropout2d(p),
            th.nn.Linear(2048, 84),
            th.nn.LeakyReLU(0.2, True),
            th.nn.Dropout2d(p),
            th.nn.Linear(84, 32),
            th.nn.LeakyReLU(0.2, True),
            th.nn.Linear(32, 1),
            th.nn.Sigmoid()
        )

    def forward(self, input):
        y = self.conv(input)
        return y

class Generator(th.nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.deconv = th.nn.Sequential(
            th.nn.ConvTranspose2d(nz, 64, 4, 1, 0),   # 64 x 4 x 4
            th.nn.BatchNorm2d(64),
            th.nn.ReLU(True),
            th.nn.ConvTranspose2d(64, 64, 4, 2, 1),   # 64 x 8 x 8
            th.nn.BatchNorm2d(64),
            th.nn.ReLU(True),
            th.nn.ConvTranspose2d(64, 64, 4, 2, 1),   # 64 x 16 x 16
            th.nn.BatchNorm2d(64),
            th.nn.ReLU(True),
            th.nn.ConvTranspose2d(64, 32, 4, 2, 1),   # 32 x 32 x 32
            th.nn.BatchNorm2d(32),
            th.nn.ReLU(True),
            th.nn.ConvTranspose2d(32, nc, 4, 2, 1),   # nc x 64 x 64
            # tanh maps the output into the [-1, 1] range
            th.nn.Tanh()
        )

    def forward(self, input):
        generated_img = self.deconv(input)
        return generated_img

# decide whether to run on GPU or CPU
if __name__ == '__main__':
    cudnn.benchmark = True

    device = th.device("cuda" if (th.cuda.is_available() and ngpu > 0) else "cpu")
    if th.cuda.is_available():
        th.cuda.manual_seed(12)

    dataset = dset.ImageFolder(root=dataset,
                               transform=transforms.Compose([
                                   transforms.Resize(image_size),
                                   transforms.CenterCrop(image_size),
                                   transforms.ToTensor(),
                                   transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
                               ]))
    # dataset = JTA200(dataset, transform=transforms.Compose([transforms.Resize(image_size),
    #                                                         transforms.CenterCrop(image_size),
    #                                                         transforms.ToTensor(),
    #                                                         transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))]))
    # create the data loader
    dataloader = th.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                          num_workers=workers, pin_memory=True)

    # Check images
    real_batch = next(iter(dataloader))
    real_batch = [_.cuda(non_blocking=True) for _ in real_batch]
    plt.figure(figsize=(8, 8))
    plt.axis("off")
    plt.title("Training images")
    plt.imshow(np.transpose(vutils.make_grid(real_batch[0].to(device)[:64], padding=2,
                                             normalize=True).cpu(), (1, 2, 0)))
    # with the custom JTA200 dataset the batch is a plain tensor:
    # plt.imshow(np.transpose(vutils.make_grid(real_batch.to(device)[:64], padding=2,
    #                                          normalize=True).cpu(), (1, 2, 0)))
    plt.show()

    # kernel size for the convolutions / deconvolutions
    kernel_size = 3
    # dropout probability
    p = 0.5

    generator = Generator().to(device)
    # handle multi-GPU (reassign so the wrapped model is actually used)
    if (device.type == 'cuda') and (ngpu >= 1):
        generator = th.nn.DataParallel(generator, list(range(ngpu)))

    if glob.glob("./model/G*"):
        print("Loading Generator ...")
        generator.load_state_dict(th.load(glob.glob("./model/G*")[0]))
        generator.eval()
    else:
        # initialize weights for the generator
        generator.apply(weights_init)

    descriminator = Descriminator(kernel_size, p).to(device)
    # same as above for the discriminator
    if (device.type == 'cuda') and (ngpu >= 1):
        descriminator = th.nn.DataParallel(descriminator, list(range(ngpu)))

    if glob.glob("./model/D*"):
        print("Loading discriminator ...")
        descriminator.load_state_dict(th.load(glob.glob("./model/D*")[0]))
        descriminator.eval()
    else:
        descriminator.apply(weights_init)

    # initialize the loss function
    loss = th.nn.BCELoss().cuda()

    # batch of latent vectors to visualize the generator's progress
    fixed_noise = th.randn(batch_size, nz, 1, 1, device=device)

    ground_truth = 1.   # float, so th.full builds a FloatTensor for BCELoss
    fake_label = 0.

    optimizerD = th.optim.Adam(descriminator.parameters(), lr=lr, betas=(0.5, 0.999))
    optimizerG = th.optim.Adam(generator.parameters(), lr=lr, betas=(0.5, 0.999))

    # Training
    # statistical info
    img_list = []
    G_losses = []
    D_losses = []
    epochs = []
    loss_legend = ['Discriminator', 'Generator']

    iters = 0

    print("------Training started------")
    time_batch = 0
    for epoch in range(nr_epochs):
        for i, data in enumerate(dataloader, 0):
            # time_batch = time.time()
            # train the discriminator with real images
            descriminator.zero_grad()  # reset gradients so batches do not mix
            # format batch
            real_cpu = data[0].to(device)
            # real_cpu = data.to(device)
            b_size = real_cpu.size(0)
            label = th.full((b_size,), ground_truth, device=device)
            # forward propagate through the discriminator
            output = descriminator(real_cpu).view(-1)
            # calculate the error and backpropagate
            loss_D_real = loss(output, label)  # the lower, the better D spots real images
            loss_D_real.backward()
            D_x = output.mean().item()
            # train with fake images
            noise = th.randn(b_size, nz, 1, 1, device=device)
            # generate images with G
            fake = generator(noise)
            label.fill_(fake_label)
            # classify all fake images
            output = descriminator(fake.detach()).view(-1)
            # calculate the classifier loss
            loss_D_fake = loss(output, label)  # how well the discriminator spots fakes
            loss_D_fake.backward()
            D_G_z1 = output.mean().item()  # average of D's guesses on fake data, should approach 0.5
            # add the gradients from the real and fake batches
            lossD = loss_D_real + loss_D_fake
            # update D
            optimizerD.step()

            # -------------- Generator ---------------
            generator.zero_grad()
            label.fill_(ground_truth)
            output = descriminator(fake).view(-1)
            lossG = loss(output, label)  # the lower, the better D is being fooled
            lossG.backward()
            D_G_z2 = output.mean().item()  # mean of D's output on the fakes after D's update, should approach 0.5
            optimizerG.step()
            G_losses.append(lossG.item())
            D_losses.append(lossD.item())
            epochs.append(epoch + i / len(dataloader))

            if (iters % 25 == 0) or ((epoch == nr_epochs - 1) and (i == len(dataloader) - 1)):
                count = 0
                with th.no_grad():
                    fake = generator(fixed_noise).detach().cpu()
                    if iters % 300 == 0:
                        # use j here so the batch index i is not shadowed
                        for j in range(batch_size - 1):
                            # img = Image.fromarray(fake[j], 'RGB')
                            img = fake[j].data
                            img = to_img(img)
                            vutils.save_image(img, "./Generated/{}.png".format(count))
                            count += 1
            if i % 50 == 0:
                print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f Time: %f'
                      % (epoch, nr_epochs, i, len(dataloader),
                         lossD.item(), lossG.item(), D_x, D_G_z1, D_G_z2, time.time() - time_batch))

                img_list.append(vutils.make_grid(fake, padding=2, normalize=True))
                time_batch = time.time()
            iters += 1
    th.save(descriminator.state_dict(), "./model/Dmodel.pth.tar")
    th.save(generator.state_dict(), "./model/Gmodel.pth.tar")
    plt.figure(figsize=(10, 5))
    plt.title("Generator and Discriminator Loss During Training")
    plt.plot(G_losses, label="G")
    plt.plot(D_losses, label="D")
    plt.xlabel("iterations")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

    fig = plt.figure(figsize=(8, 8))
    plt.axis("off")
    ims = [[plt.imshow(np.transpose(i, (1, 2, 0)), animated=True)] for i in img_list]
    ani = animation.ArtistAnimation(fig, ims, interval=1000, repeat_delay=1000, blit=True)
    # ani.save("animation.mp4")
    # HTML(ani.to_jshtml())
    plt.show()

    print("--- %s seconds for training ---" % (time.time() - start_time))
    # grab a batch of real images from the dataloader
    real_batch = next(iter(dataloader))
    # real_batch = [_.cuda(non_blocking = True) for _ in real_batch]
    # plot the real images
    plt.figure(figsize=(15, 15))
    plt.subplot(1, 2, 1)
    plt.axis("off")
    plt.title("Real Images")
    plt.imshow(np.transpose(vutils.make_grid(real_batch[0].to(device)[:64], padding=5, normalize=True).cpu(), (1, 2, 0)))
    # with the custom JTA200 dataset the batch is a plain tensor:
    # plt.imshow(np.transpose(vutils.make_grid(real_batch.to(device)[:64], padding=5, normalize=True).cpu(), (1, 2, 0)))

    # plot the fake images from the last epoch
    plt.subplot(1, 2, 2)
    plt.axis("off")
    plt.title("Fake Images")
    plt.imshow(np.transpose(img_list[-1], (1, 2, 0)))
    plt.show()

Using an HDD might create a data loading bottleneck.
You could use the data_time meter from the ImageNet example to track the data loading time.
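
Something like this, as a minimal sketch of that pattern (AverageMeter here is a simplified version of the small helper class from the ImageNet example):

    import time

    class AverageMeter:
        """Keeps a running average of a measured value."""
        def __init__(self):
            self.val = 0.0
            self.sum = 0.0
            self.count = 0
            self.avg = 0.0

        def update(self, val, n=1):
            self.val = val
            self.sum += val * n
            self.count += n
            self.avg = self.sum / self.count

    data_time = AverageMeter()
    end = time.time()
    for i, data in enumerate(dataloader):
        # everything between the end of the last iteration and this point
        # is time spent waiting for the data
        data_time.update(time.time() - end)
        # ... training step ...
        end = time.time()
        if i % 50 == 0:
            print("avg data loading time: %.3f s" % data_time.avg)

If the data_time average stays close to the total batch time, the GPU is starving for data rather than being the bottleneck.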

This post gives you some more information in case you haven’t read it already.

Thank you for your reply. I had already tracked the data loading time with time.time(), which is what gave me the idea that this was indeed the problem. I will give some of the ideas in the proposed post a try.