DataLoader seems to crash

Hi!

My custom DataLoader seems to crash after a fair number of iterations. My training data consists of ~60 000 rows of information about songs.
Here is my custom Dataset (which I feed to a DataLoader):

import logging
import random
from typing import Tuple

import torch
from torch.utils.data import Dataset
import pickle
import os

import constants as c  # Custom constants module

log = logging.getLogger(__name__)


def normalize(spec0: torch.Tensor, spec1: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    spec0 = torch.div(spec0 - c.mean_std[0], c.mean_std[1])
    spec1 = torch.div(spec1 - c.mean_std[0], c.mean_std[1])
    return spec0, spec1


class CustomDataset(Dataset):

    def __init__(self, data):
        super().__init__()
        self.data = data
        self.ids = list(data.id.array)
        self.duration = 216
        self.overlap = int(self.duration / 2)

    def __getitem__(self, index):
        song0 = self.data.iloc[index]
        song0_id = song0.id
        # print("song0_id: {}".format(song0_id))
        # Only gather all songs with the same genre/genres if we are using Contrastive Loss + Genre or our new loss
        # function
        if c.use_genre:
            genres0 = song0.genre.split(";")
            # Using a set to avoid duplicates
            genres0_songs = set()
            # Gather all songs with the same genre/genres as song0
            for genre in genres0:
                songs = c.analyzed_genres.at[genre, 'songs'].split(';')
                genres0_songs.update(songs)
            # Remove song0 from genres0_songs, because we don't want the same song again
            genres0_songs.discard(song0_id)
            if len(genres0_songs) == 0:  # Make sure genres0_songs contains at least one song
                raise AttributeError('genres0_songs contains no songs!')
        else:
            genres0_songs = None  # Just to keep PyCharm happy

        if c.loss_fn == 'contrastive':
            # If we are using the standard Contrastive Loss
            label = random.randint(0, 1)
            if c.clazz == 'song':
                # If we are using same song as class
                if label == 1:
                    # If it should be the same song
                    song1_id = song0_id
                else:
                    # If it should be a different song
                    # Choose a random id that's not equal to song0's id
                    song1_id = random.choice(self.ids)
                    while song1_id == song0_id:
                        song1_id = random.choice(self.ids)
            else:
                if label == 1:
                    # If it's the same genre
                    song1_id = random.choice(tuple(genres0_songs))
                else:
                    # If it should be a song that doesn't share a genre with song0
                    song1_id = random.choice(self.ids)
                    while (song1_id == song0_id) or (song1_id in genres0_songs):
                        song1_id = random.choice(self.ids)
        else:
            label = random.randint(0, 2)

            if label == 0:
                # If it's the same song
                song1_id = song0_id
            else:
                if label == 1:
                    # If it's the same genre
                    song1_id = random.choice(tuple(genres0_songs))
                else:
                    # If it should be another song that doesn't share a genre with song0
                    song1_id = random.choice(self.ids)
                    while (song1_id == song0_id) or (song1_id in genres0_songs):
                        song1_id = random.choice(self.ids)

        # log.info('Label: {}    Song_0: {}     Song_1: {}'.format(label, song0_id, song1_id))
        spec0, spec1 = self.get_specs(song0_id, song1_id, self.duration) # mel_spectrograms computed by librosa and normalized
        # log.info('Specs for previous songs loaded...')

        return spec0, spec1, torch.tensor(data=label)

    def get_specs(self, song0: str, song1: str, duration: int) -> Tuple[torch.Tensor, torch.Tensor]:
        if song0 is None:
            raise AttributeError('song0 id cannot be None!')
        if song1 is None:
            raise AttributeError('song1 id cannot be None!')
        if duration is None:
            raise AttributeError('duration cannot be None!')
        # spec0 = c.loaded_specs[song0]
        with open(os.path.join(c.analyzed_songs_path, song0), 'rb') as f:
            spec0 = pickle.load(f)
        spec0_x = spec0.shape[1]
        offset0 = random.randint(0, spec0_x - duration)
        # print("offset0: {}".format(offset0))
        if song0 == song1:
            spec1 = spec0
            offset1 = random.randint(0, spec0_x - duration)
            while abs(offset0 - offset1) < self.overlap:
                offset1 = random.randint(0, spec0_x - duration)
            # print("offset1: {}".format(offset1))
        else:
            # spec1 = c.loaded_specs[song1]
            with open(os.path.join(c.analyzed_songs_path, song1), 'rb') as f:
                spec1 = pickle.load(f)
            offset1 = random.randint(0, spec1.shape[1] - duration)
            # print("offset1: {}".format(offset1))
        spec0 = torch.tensor(data=spec0[:, offset0:offset0 + duration])
        spec1 = torch.tensor(data=spec1[:, offset1:offset1 + duration])

        spec0, spec1 = normalize(spec0, spec1)

        return spec0, spec1

    def __len__(self):
        return len(self.data.index)
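
For completeness, this is roughly how I build the loaders that get passed to the training loop below. The batch size and shuffle flags here are placeholders, but I do use 4 workers; train_df and validation_df stand for the pandas DataFrames with the song information:

from torch.utils.data import DataLoader

# Rough sketch of the loader construction; batch_size and shuffle are placeholders,
# num_workers=4 matches what I actually use.
train_data = DataLoader(CustomDataset(train_df), batch_size=32,
                        shuffle=True, num_workers=4)
validation_data = DataLoader(CustomDataset(validation_df), batch_size=32,
                             shuffle=False, num_workers=4)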

Here’s my training loop:

import logging
import os
import pickle

import numpy as np
import torch
from torch import optim

import constants as c  # Custom constants module, same as above

log = logging.getLogger(__name__)


def train_model(nn, loss_fn, path, train_data, validation_data):
    # Train the network for the configured number of epochs
    log.info('Training for {} epochs'.format(c.epochs))
    tmp = []
    nn = nn.double()
    optimizer = optim.Adam(nn.parameters(), lr=0.0005)
    for epoch in range(c.epochs):
        log.info('========== TRAIN EPOCH {} =========='.format(epoch))
        print("Epoch: {}".format(epoch))
        nn.train()
        for i, data in enumerate(train_data):
            specs0, specs1, labels = data
            print("Got accumulated data")
            spec0 = specs0.to(c.device)
            spec1 = specs1.to(c.device)
            label = labels.to(c.device)
            if (epoch == 0) and (i == 0):
                log.info('spec0.device = {}'.format(spec0.device))
                log.info('spec1.device = {}'.format(spec1.device))
                log.info('label.device = {}'.format(label.device))
            optimizer.zero_grad()
            output0, output1 = nn(spec0, spec1)
            loss = loss_fn(output0, output1, label)
            loss.backward()
            optimizer.step()
            tmp.append(loss.item())
        # Save loss history
        print("Finished training for epoch {}".format(epoch))
        with open(os.path.join(path, 'train_loss_history_epoch_{}'.format(epoch)), 'wb') as f:
            pickle.dump(tmp, f)
        # Calculate average loss for the last epoch
        avg = np.mean(tmp)
        tmp.clear()
        log.info('Epoch: {}; Average loss: {}'.format(epoch, avg))
        torch.save(nn.state_dict(), os.path.join(path, 'state_epoch_{}'.format(epoch)))
        with torch.no_grad():
            log.info('========== VALIDATE EPOCH {} =========='.format(epoch))
            nn.eval()
            for i, data in enumerate(validation_data):
                specs0, specs1, labels = data
                spec0 = specs0.to(c.device)
                spec1 = specs1.to(c.device)
                label = labels.to(c.device)
                if (epoch == 0) and (i == 0):
                    log.info('spec0.device = {}'.format(spec0.device))
                    log.info('spec1.device = {}'.format(spec1.device))
                    log.info('label.device = {}'.format(label.device))
                output0, output1 = nn(spec0, spec1)
                loss = loss_fn(output0, output1, label)
                tmp.append(loss.item())
            # Save loss history
            with open(os.path.join(path, 'validation_loss_history_epoch_{}'.format(epoch)), 'wb') as f:
                pickle.dump(tmp, f)
            # Calculate the average loss
            avg = np.mean(tmp)
            tmp.clear()
            log.info('Epoch: {}; Average loss: {}'.format(epoch, avg))

I have noticed the following: in the beginning everything works as expected. GPU utilization is at 100% and there is enough free memory on both the GPU and in system RAM. But after some time the DataLoader's 4 worker processes disappear from top, training stops, and GPU utilization drops to 0%. Changing batch_size and num_workers does not help. Unfortunately, I don't get any errors or other output that would help me find the problem.

Let me know what additional information you need. I’m not quite sure what information to add.

Thank you in advance!

Do you see any stack trace if you kill the process via Ctrl+C?
If so, could you please post it here?

Unfortunately, only the stack trace ending in KeyboardInterrupt:

Traceback (most recent call last):
  File "main.py", line 143, in <module>
    main()
  File "main.py", line 29, in main
    train_model(nn, loss_fn, path, train_data, validation_data)
  File "main.py", line 76, in train_model
    for i, data in enumerate(train_data):
  File "/home/todor/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 345, in __next__
    data = self._next_data()
  File "/home/todor/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 841, in _next_data
    idx, data = self._get_data()
  File "/home/todor/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 808, in _get_data
    success, data = self._try_get_data()
  File "/home/todor/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 761, in _try_get_data
    data = self._data_queue.get(timeout=timeout)
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 107, in get
    if not self._poll(timeout):
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 257, in poll
    return self._poll(timeout)
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 424, in _poll
    r = wait([self], timeout)
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 930, in wait
    ready = selector.select(timeout)
  File "/usr/lib/python3.8/selectors.py", line 415, in select
    fd_event_list = self._selector.poll(timeout)
KeyboardInterrupt

That’s still useful; it looks like the DataLoader is hanging rather than crashing.
Could you update to the nightly binary and rerun your code please?
If necessary, create a new virtual environment.
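You could also iterate the Dataset directly in the main process (i.e. without any workers) to see whether a single __getitem__ call ever blocks; a rough sketch, where train_df stands for whatever DataFrame you pass to CustomDataset in your script:

import random
import time

# Sanity check without worker processes: does one sample ever block or take
# unusually long? The iteration count and the 5 s threshold are arbitrary.
ds = CustomDataset(train_df)
for step in range(10000):
    idx = random.randrange(len(ds))
    t0 = time.time()
    _ = ds[idx]
    elapsed = time.time() - t0
    if elapsed > 5.0:
        print('index {} took {:.1f}s'.format(idx, elapsed))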

I’ve uninstalled PyTorch 1.4 stable and installed the nightly version. print(torch.__version__) returns 1.6.0.dev20200409+cu101. The result stays the same…

I’ve tried to debug my code but I didn’t find anything suspicious.
@ptrblck Do you have any other ideas?

Okay, I’ve found my error. There is nothing wrong with the DataLoader; the problem is in the get_specs() function. These lines were the troublemakers:

...
if song0 == song1:
    spec1 = spec0
    offset1 = random.randint(0, spec0_x - duration)
    while abs(offset0 - offset1) < self.overlap:
        offset1 = random.randint(0, spec0_x - duration)
...

The reason is that this rejection loop can take forever to find a suitable offset, and for a short song it can never succeed at all: if spec0_x - duration is smaller than self.overlap, no candidate offset1 is far enough away from offset0, so the loop spins forever inside the worker and the whole DataLoader appears to hang. To avoid this, one can build the set of allowed offsets explicitly and exclude the overlapping range instead of rejecting random candidates. I’ve done it the following way:

...
if song0 == song1:
    spec1 = spec0
    overlap_range = range(max(0, offset0 - self.overlap),
                          min(spec0_x - self.duration, offset0 + self.overlap))
    offset1_range = [i for i in range(0, spec0_x) if
                     (i not in overlap_range) and (i <= spec0_x - self.duration)]
    offset1 = random.choice(offset1_range)
...
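
To convince myself that the new logic cannot hang, I also ran a small standalone check with made-up numbers that mimic a short song (the values are illustrative, not from my data):

import random

spec0_x, duration, overlap = 300, 216, 108       # overlap = duration // 2
offset0 = random.randint(0, spec0_x - duration)  # any value in [0, 84]

# With the old code, no offset1 in [0, 84] can be at least 108 frames away
# from offset0, so the rejection loop would never exit for a song this short.
# The list-based version always terminates (although for such a short song the
# chosen offset1 may still be closer to offset0 than `overlap`).
overlap_range = range(max(0, offset0 - overlap),
                      min(spec0_x - duration, offset0 + overlap))
offset1_range = [i for i in range(0, spec0_x)
                 if (i not in overlap_range) and (i <= spec0_x - duration)]
offset1 = random.choice(offset1_range)
print(offset0, offset1, abs(offset0 - offset1))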

I’m hitting the same problem: after training for one epoch and running validation, the training process gets stuck. After Ctrl+C, the DataLoader traceback also ends in self._selector.poll(timeout).