Hi all,
Firstly, I am a newbie with PyTorch, so please accept my apologies if I’m doing something wrong here. I’ve gone through other threads on similar issues, but they haven’t helped yet.
I’ve been stuck on this problem for a while. I’m using PyTorch to load a set of NumPy arrays from NumPy files stored in Google Drive and then use them to train a generator-discriminator model. There are 9K files (each about 3 MB on the drive), and each array holds 1025 × 431 float values. I am trying to run 20–40 epochs, using the GPU in Google Colab. The first few epochs run all right, but after a while training gets stuck partway through an epoch: Google Colab doesn’t crash, the RAM limit isn’t exceeded, and everything indicates that the cell is still running, but even after a few hours it makes no further progress within the data loader. Below are my dataset, my data loader and the actual training loop. The odd thing is that every time I kill the training cell and restart it, it runs for fewer epochs before getting stuck (the first time it did 13, then 6, and then 2). What am I doing wrong?
# define custom dataset class
class GuitarSpectDataset(Dataset):
    """Dataset of arrays stored as individual NumPy files in one directory.

    Every entry of the directory except ``max_value.npy`` is treated as one
    sample. Files are only listed at construction time; the array itself is
    read with ``np.load`` when the sample is requested and is then passed
    through ``ToTensor``.
    """

    # Directory entry that is skipped when the samples are listed.
    _EXCLUDED_FILE_NAME = "max_value.npy"

    def __init__(self, normalized_music_array_dir: str):
        """Index the sample files found in ``normalized_music_array_dir``."""
        # Names of the sample files, in sorted order.
        self.file_names = self._list_sample_names(normalized_music_array_dir)
        print(len(self.file_names))
        # Full path of each sample, aligned index-for-index with file_names.
        self.file_paths = [
            os.path.join(normalized_music_array_dir, file_name)
            for file_name in self.file_names
        ]
        # Built once here instead of on every __getitem__ call.
        self.transform = Compose([
            ToTensor(),
        ])

    @staticmethod
    def _list_sample_names(directory: str):
        """Return the sorted sample file names found in ``directory``.

        ``os.listdir`` returns entries in arbitrary order, so the names are
        sorted to keep the index -> file mapping identical between runs.
        """
        return sorted(
            name
            for name in os.listdir(directory)
            if name != GuitarSpectDataset._EXCLUDED_FILE_NAME
        )

    def __getitem__(self, idx: int):
        """Load the array stored at position ``idx`` and return it transformed."""
        curr_array = np.load(self.file_paths[idx])
        return self.transform(curr_array)

    def __len__(self) -> int:
        """Return the number of sample files indexed at construction time."""
        return len(self.file_paths)
# Build the dataset from the directory that holds the normalized arrays.
dataset = GuitarSpectDataset(normalized_music_array_dir)
# Wrap it in a loader: batches of 16 samples, loaded with 4 workers.
# NOTE: shuffle is left at its default here; switching it off was tried to see
# whether it caused the stall, and it did not help.
data_loader = DataLoader(dataset=dataset, num_workers=4, batch_size=16)
# Put both networks in training mode before the optimisation loop.
discriminator.train()
generator.train()

# Each epoch makes one pass over data_loader; every batch performs one
# discriminator update followed by one generator update.
for epoch in range(num_epochs):
    # Running sums of the per-batch losses and the number of batches seen.
    epochG_loss = epochD_loss = cnt = 0

    for i, batch_x in enumerate(data_loader):
        print("epoch = ", epoch, "i = ", i, " entered train discriminator")

        ## train discriminator
        # Move the real batch to the GPU as float32 and build the real targets.
        batch_x = batch_x.to("cuda").float()
        batch_size = batch_x.shape[0]
        # dtype is given explicitly so the label tensor is float32 regardless
        # of whether real_label is written as an int or a float.
        batch_y = torch.full((batch_size,), real_label, dtype=torch.float, device="cuda")

        # Clear discriminator gradients left over from the previous step
        # (including those accumulated by the generator's backward pass).
        discriminator.zero_grad()

        # Real batch through the discriminator, flattened to one score per sample.
        outputs = discriminator(
            batch_x.view(batch_size, 1, batch_x.shape[2], batch_x.shape[3])
        ).view(-1)
        errD_real = criterion(outputs, batch_y)
        errD_real.backward()

        # Generate a fake batch from random normal noise of shape (batch_size, 1, 18, 9).
        generated_images = generator(torch.randn((batch_size, 1, 18, 9), device="cuda"))
        batch_y = torch.full((batch_size,), fake_label, dtype=torch.float, device="cuda")

        # detach() keeps this backward pass from reaching the generator.
        outputs = discriminator(generated_images.detach()).view(-1)
        errD_fake = criterion(outputs, batch_y)
        errD_fake.backward()

        # Apply the gradients accumulated from the real and the fake batch.
        optimizerD.step()

        ## train generator
        generator.zero_grad()

        # The generator is scored against the *real* label for its fakes.
        batch_y = torch.full((batch_size,), real_label, dtype=torch.float, device="cuda")

        # Same fake batch, this time without detach() so gradients reach the generator.
        outputs = discriminator(generated_images).view(-1)
        errG = criterion(outputs, batch_y)
        errG.backward()

        # Total discriminator loss for this batch (used for reporting only).
        errD = errD_real + errD_fake

        optimizerG.step()

        # Accumulate the losses as Python floats.
        epochD_loss += errD.item()
        epochG_loss += errG.item()
        cnt += 1
        print("epoch = ", epoch, "i = ", i, " one batch done")

    # Fail with a clear message instead of a bare ZeroDivisionError when the
    # loader yielded nothing (for example, an empty data directory).
    if cnt == 0:
        raise RuntimeError("data_loader produced no batches; cannot average the epoch losses")

    # Average the losses over the batches of this epoch.
    epochD_loss /= cnt
    epochG_loss /= cnt

    save_checkpoint(generator, optimizerG, discriminator, optimizerD, epoch)

    # Report the averaged losses every 10th epoch.
    if (epoch + 1) % 10 == 0:
        print("Training loss for epoch {} is {:.5f} for disciminator and {:.5f} for generator".format(epoch + 1, epochD_loss, epochG_loss))

    # Every 50th epoch, render and save a sample generated from fixed_noise.
    if (epoch + 1) % 50 == 0:
        print('\nGenerated Image')

        # Switch to eval mode for the sample; no_grad() already prevents graph
        # construction, so no extra detach() is needed.
        generator.eval()
        with torch.no_grad():
            output = generator(fixed_noise.to('cuda')).cpu().numpy()

        # (output + 1) / 2 * max_value rescales the output before plotting and
        # saving; presumably the generator output lies in [-1, 1] - TODO confirm.
        # reshape(1025, 431) requires the output to hold exactly one 1025 x 431 sample.
        plt.imshow(((output.reshape(1025, 431) + 1)/2) * max_value)
        plt.show()

        curr_array = ((output + 1)/2) * max_value
        print(curr_array.shape, curr_array.max(), curr_array.min())
        np.save("/content/drive/MyDrive/AI ML Projects/Music" + "/pytorch_CNN_" + str(epoch) + ".npy", curr_array)

        # Back to training mode for the next epoch.
        generator.train()