Training "never finishes" or system crashes using PyTorch - GPU has memory allocated but always has 0% utilization using DataLoader

marco_c · January 24, 2023, 6:23pm

@ptrblck I believe that your time is scarce to keep answering me here, but I believe that the problem is that possibly in the environment that I was able to run the code without problems, the data loading was carried out using SSD and in this new environment it must be an HDD, the which should drastically affect the speed of code execution. I should also say that in a test I did today, I was able to perform the training, using only CPU, with 1 epoch and greatly reducing the number of samples, but even so it took a long time to complete the training (about 3 hours). I’ll pass you the script that executes the code related to the DataLoader, this is the script (“data_loader.py”) that I’m using in the last tests that I’m doing (when I tested it with a lot less samples I added the line “self.samples = self.samples[:5000]”).

import os
import pdb
import pickle
from os.path import basename, isfile, join

import h5py
import numpy as np
import pandas as pd
import pytz
import torch.utils.data as data
from numpy import arange, array, zeros
from scipy import signal
from torch import Tensor

from station import is_equinox, is_summer, is_winter
from scipy.signal import savgol_filter

TIMEZONE = pytz.timezone("UTC")

class SequenceLoader(data.Dataset):
    """Main Class for Image Folder loader."""
    def __init__(self,
                 name,
                 path_files,
                 seq_length_min,
                 step_min,
                 window_train,
                 window_predict,
                 exovariables=False,
                 data=None,
                 station='all',
                 ever=False,
                 smooth=False,
                 scaler=False):
        """Init function."""
                                                            
        self.name = name
        self.path_files = path_files
        self.seq_length_min = seq_length_min
        self.step_min = step_min
        self.seq_length = int(seq_length_min / self.step_min)
        self.window_train = window_train
        self.window_predict = window_predict
        self.exovariables = exovariables
        self.data = data
        self.step = int(step_min / 10)
        self.ever = ever

        # try load samples from file, if not generate samples from database
        self.df = pd.read_pickle(f'df_{name}.pkl')
        self.samples = pickle.load(open(f'samples_{name}.pkl', 'rb'))
        self.samples = self.samples[:5000]
        
        if station == 'all':
            pass
        elif station == 'summer':
            self.samples = [i for i in self.samples if is_summer(i)]
        elif station == 'winter':
            self.samples = [i for i in self.samples if is_winter(i)]
        elif station == 'equinox':
            self.samples = [i for i in self.samples if is_equinox(i)]
        else:
            raise Exception(f'Invalid station => {station}')

        self.input_channel = 1
        self.shape = (1, 7, 7)
        self.seq_shape = (self.seq_length, self.input_channel, self.shape[-2],
                          self.shape[-1])

        #self.hdf = h5py.File(f'file.h5', 'r')

    def get_index_time(self, index):
        index_start, index_end = self.samples[index]
        return index_start, index_end

    def load(self, index):
        while True:
            try:
                index_start, index_end = self.samples[index]
                df_aux = self.df[index_start:index_end]

                samples_index = df_aux.index

                sample = zeros(self.seq_shape)
                aux_array = None

                for idx, aux in enumerate(samples_index):
                    key = aux.strftime('%Y-%m-%d %H:%M')
                    aux_array = array(h5py.File(f'file.h5', 'r').get(key))
                    aux_array[:, :][aux_array[:, :] == -np.inf] = 0
                    aux_array[:, :][aux_array[:, :] < 0] = np.quantile(aux_array[:, :], 0.10)

                    aux_array[:, :] /= 250
                    sample[idx, 0, :, :] = aux_array

                # test for empty array
                if np.any(np.isnan(sample)):
                    index = np.random.choice(arange(0, self.__len__()))
                else:
                    break

            except KeyError as e:
                index = np.random.choice(np.arange(0, self.__len__()))
            except IndexError as e:
                index = np.random.choice(arange(0, self.__len__()))

        X = sample[0:self.window_train, :, :, :]
        y = sample[self.window_train:, :, :, :]

        X = Tensor(X.transpose(1, 0, 2, 3))
        #print("X.device: "+str(X.device))
        #print("\n\n")
        y = Tensor(y.transpose(1, 0, 2, 3))
        #print("y.device: "+str(y.device))
        #print("\n\n")

        return X, y
    def __getitem__(self, index):
        """Get item."""
        return self.load(index)

    def __len__(self):
        """Length."""
        return len(self.samples)

The commented line #self.hdf = h5py.file (f'file.h5 ',' r ') was originally decominate, but to solve the problem I mentioned in the answer" https://discuss.pytorch.org/ t/training-over-thin-system-crashes-using-pytorch-gppu-has-allocated-but-always-has-0-Utilization-using-dataloader/170801/5?u= mark_c", specifically performing some adjustments described here - “Data Loader does not work with Hdf5 file, when num_worker >1 · Issue #11929 · pytorch/pytorch · GitHub” - I made the appropriate changes, opening the file only in the function __getitem__. Can you help me please? Thanks in advance.