Hi!
My custom DataLoader seems to crash after a fair number of iterations. My training data consists of ~60,000 rows of information about songs.
Here is my Dataset (which I wrap in a DataLoader with four workers):
import logging
import os
import pickle
import random
from typing import Tuple

import torch
from torch.utils.data import Dataset

import constants as c  # custom constants module

log = logging.getLogger(__name__)


def normalize(spec0: torch.Tensor, spec1: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    # Standardize both spectrograms with the precomputed dataset mean/std
    spec0 = torch.div(spec0 - c.mean_std[0], c.mean_std[1])
    spec1 = torch.div(spec1 - c.mean_std[0], c.mean_std[1])
    return spec0, spec1
class CustomDataset(Dataset):
    def __init__(self, data):
        super().__init__()
        self.data = data
        self.ids = list(data.id.array)
        self.duration = 216              # crop width in spectrogram frames
        self.overlap = self.duration // 2
    def __getitem__(self, index):
        song0 = self.data.iloc[index]
        song0_id = song0.id
        # Only gather all songs with the same genre(s) if we are using
        # Contrastive Loss + Genre or our new loss function
        if c.use_genre:
            genres0 = song0.genre.split(';')
            # Use a set to avoid duplicates
            genres0_songs = set()
            # Gather all songs that share at least one genre with song0
            for genre in genres0:
                songs = c.analyzed_genres.at[genre, 'songs'].split(';')
                genres0_songs.update(songs)
            # Remove song0 itself, because we don't want the same song again
            genres0_songs.discard(song0_id)
            if len(genres0_songs) == 0:  # there must be at least one candidate
                raise AttributeError('genres0_songs contains no songs!')
        else:
            genres0_songs = None  # just to keep PyCharm happy

        if c.loss_fn == 'contrastive':
            # Standard Contrastive Loss: binary label
            label = random.randint(0, 1)
            if c.clazz == 'song':
                # "Same song" is the positive class
                if label == 1:
                    # Positive pair: the same song
                    song1_id = song0_id
                else:
                    # Negative pair: choose a random id that differs from song0's
                    song1_id = random.choice(self.ids)
                    while song1_id == song0_id:
                        song1_id = random.choice(self.ids)
            else:
                if label == 1:
                    # Positive pair: a song with a shared genre
                    song1_id = random.choice(tuple(genres0_songs))
                else:
                    # Negative pair: a song with no shared genre
                    song1_id = random.choice(self.ids)
                    while song1_id == song0_id or song1_id in genres0_songs:
                        song1_id = random.choice(self.ids)
        else:
            # Our new loss function: ternary label
            label = random.randint(0, 2)
            if label == 0:
                # The same song
                song1_id = song0_id
            elif label == 1:
                # A song with a shared genre
                song1_id = random.choice(tuple(genres0_songs))
            else:
                # A song with no shared genre
                song1_id = random.choice(self.ids)
                while song1_id == song0_id or song1_id in genres0_songs:
                    song1_id = random.choice(self.ids)

        # mel spectrograms computed by librosa and normalized
        spec0, spec1 = self.get_specs(song0_id, song1_id, self.duration)
        return spec0, spec1, torch.tensor(data=label)
    def get_specs(self, song0: str, song1: str, duration: int) -> Tuple[torch.Tensor, torch.Tensor]:
        if song0 is None:
            raise AttributeError('song0 id cannot be None!')
        if song1 is None:
            raise AttributeError('song1 id cannot be None!')
        if duration is None:
            raise AttributeError('duration cannot be None!')
        # spec0 = c.loaded_specs[song0]
        with open(os.path.join(c.analyzed_songs_path, song0), 'rb') as f:
            spec0 = pickle.load(f)
        spec0_x = spec0.shape[1]
        offset0 = random.randint(0, spec0_x - duration)
        if song0 == song1:
            spec1 = spec0
            # Re-sample offset1 until the two crops overlap by at most half their width
            offset1 = random.randint(0, spec0_x - duration)
            while abs(offset0 - offset1) < self.overlap:
                offset1 = random.randint(0, spec0_x - duration)
        else:
            # spec1 = c.loaded_specs[song1]
            with open(os.path.join(c.analyzed_songs_path, song1), 'rb') as f:
                spec1 = pickle.load(f)
            offset1 = random.randint(0, spec1.shape[1] - duration)
        spec0 = torch.tensor(data=spec0[:, offset0:offset0 + duration])
        spec1 = torch.tensor(data=spec1[:, offset1:offset1 + duration])
        spec0, spec1 = normalize(spec0, spec1)
        return spec0, spec1

    def __len__(self):
        return len(self.data.index)
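For completeness, train_data and validation_data below are ordinary DataLoader instances wrapped around this Dataset. The exact numbers vary because I have been experimenting, but the construction looks roughly like this (train_df and validation_df stand for my pandas DataFrames of song rows; batch_size=32 is just one of the values I tried):

from torch.utils.data import DataLoader

# Sketch of the loader construction; batch_size is one of several values
# I have tried, and num_workers is usually 4.
train_data = DataLoader(CustomDataset(train_df), batch_size=32,
                        shuffle=True, num_workers=4)
validation_data = DataLoader(CustomDataset(validation_df), batch_size=32,
                             shuffle=False, num_workers=4)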
Here’s my training loop:
import numpy as np
import torch.optim as optim


def train_model(nn, loss_fn, path, train_data, validation_data):
    # Train the network for the configured number of epochs
    log.info('Training for {} epochs'.format(c.epochs))
    tmp = []  # per-batch losses for the current epoch
    nn = nn.double()
    optimizer = optim.Adam(nn.parameters(), lr=0.0005)
    for epoch in range(c.epochs):
        log.info('========== TRAIN EPOCH {} =========='.format(epoch))
        print('Epoch: {}'.format(epoch))
        nn.train()
        for i, data in enumerate(train_data):
            specs0, specs1, labels = data
            print('Got accumulated data')
            spec0 = specs0.to(c.device)
            spec1 = specs1.to(c.device)
            label = labels.to(c.device)
            if epoch == 0 and i == 0:
                log.info('spec0.device = {}'.format(spec0.device))
                log.info('spec1.device = {}'.format(spec1.device))
                log.info('label.device = {}'.format(label.device))
            optimizer.zero_grad()
            output0, output1 = nn(spec0, spec1)
            loss = loss_fn(output0, output1, label)
            loss.backward()
            optimizer.step()
            tmp.append(loss.item())
        # Save the training loss history for this epoch
        print('Finished training for epoch {}'.format(epoch))
        with open(os.path.join(path, 'train_loss_history_epoch_{}'.format(epoch)), 'wb') as f:
            pickle.dump(tmp, f)
        # Average training loss over the last epoch
        avg = np.mean(tmp)
        tmp.clear()
        log.info('Epoch: {}; Average loss: {}'.format(epoch, avg))
        torch.save(nn.state_dict(), os.path.join(path, 'state_epoch_{}'.format(epoch)))

        with torch.no_grad():
            log.info('========== VALIDATE EPOCH {} =========='.format(epoch))
            nn.eval()
            for i, data in enumerate(validation_data):
                specs0, specs1, labels = data
                spec0 = specs0.to(c.device)
                spec1 = specs1.to(c.device)
                label = labels.to(c.device)
                if epoch == 0 and i == 0:
                    log.info('spec0.device = {}'.format(spec0.device))
                    log.info('spec1.device = {}'.format(spec1.device))
                    log.info('label.device = {}'.format(label.device))
                output0, output1 = nn(spec0, spec1)
                loss = loss_fn(output0, output1, label)
                tmp.append(loss.item())
            # Save the validation loss history for this epoch
            with open(os.path.join(path, 'validation_loss_history_epoch_{}'.format(epoch)), 'wb') as f:
                pickle.dump(tmp, f)
            # Average validation loss
            avg = np.mean(tmp)
            tmp.clear()
            log.info('Epoch: {}; Average loss: {}'.format(epoch, avg))
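For reference, I invoke it roughly like this (SiameseNetwork and ContrastiveLoss are placeholder names for my actual model and loss classes, and out_path for the checkpoint directory):

model = SiameseNetwork().to(c.device)  # placeholder model class
criterion = ContrastiveLoss()          # placeholder loss class
train_model(model, criterion, out_path, train_data, validation_data)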
I have noticed the following:
In the beginning, everything works as expected: GPU utilization is at 100%, and there is enough free memory on both the GPU and the host. But after some time, the DataLoader's four worker processes disappear from top, training stops, and GPU utilization drops to 0%. Changing batch_size and num_workers does not help. Unfortunately, I get no errors or any other output that would help me find the problem.
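In case it is useful, this is the kind of instrumentation I could add to surface a silent stall (a sketch using Python's standard faulthandler module; the timeout value is arbitrary):

import faulthandler

# Dump tracebacks immediately on hard crashes (e.g. a segfault in a worker)
faulthandler.enable()
# Also dump all threads' tracebacks every 10 minutes, so a silent hang
# leaves some trace in stderr; repeat=True re-arms the timer each time.
faulthandler.dump_traceback_later(timeout=600, repeat=True)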
Let me know what additional information you need; I'm not quite sure what else would be helpful.
Thank you in advance!