@ptrblck I know your time for answering me here is limited, but I believe the problem is this: in the environment where I was able to run the code without problems, the data was probably being loaded from an SSD, whereas in this new environment it is probably an HDD, which should drastically affect the execution speed of the code. I should also say that in a test I did today I was able to complete the training using only the CPU, with 1 epoch and a greatly reduced number of samples, but even so it took a long time to finish (about 3 hours). Below is the script that contains the DataLoader-related code; this is the script (“data_loader.py”) that I’m using in my latest tests (when I tested it with far fewer samples I added the line “self.samples = self.samples[:5000]”).
import os
import pdb
import pickle
from os.path import basename, isfile, join
import h5py
import numpy as np
import pandas as pd
import pytz
import torch.utils.data as data
from numpy import arange, array, zeros
from scipy import signal
from torch import Tensor
from station import is_equinox, is_summer, is_winter
from scipy.signal import savgol_filter
# UTC timezone object; it is not referenced anywhere in the code shown here.
TIMEZONE = pytz.timezone("UTC")
class SequenceLoader(data.Dataset):
    """Dataset of fixed-length frame sequences read from an HDF5 file.

    Every entry of ``self.samples`` is an ``(index_start, index_end)`` pair
    used to slice ``self.df``.  The index of the resulting slice supplies the
    timestamps whose ``'%Y-%m-%d %H:%M'`` string is the key of one frame in
    the HDF5 file.  Frames are cleaned, divided by 250 and stacked into an
    array of shape ``self.seq_shape``, which is then split into an input
    tensor and a target tensor.

    The HDF5 handle is opened lazily, once per process, instead of once per
    frame: re-opening the file for every timestep leaks handles and is very
    slow, while sharing a handle opened in ``__init__`` breaks with
    ``DataLoader(num_workers > 0)``.
    """

    # Path of the HDF5 file that stores one dataset per timestamp key.
    HDF_PATH = 'file.h5'

    def __init__(self,
                 name,
                 path_files,
                 seq_length_min,
                 step_min,
                 window_train,
                 window_predict,
                 exovariables=False,
                 data=None,
                 station='all',
                 ever=False,
                 smooth=False,
                 scaler=False):
        """Load the sample index and the dataframe for ``name``.

        Args:
            name: Suffix of the ``df_{name}.pkl`` / ``samples_{name}.pkl``
                files read from the current working directory.
            path_files: Stored on the instance; not used by this class.
            seq_length_min: Divided by ``step_min`` to get the number of
                frames per sequence (``self.seq_length``).
            step_min: See ``seq_length_min``; ``int(step_min / 10)`` is also
                stored as ``self.step``.
            window_train: Number of leading frames returned as the input
                ``X``; the remaining frames form the target ``y``.
            window_predict: Stored on the instance; not used by this class.
            exovariables: Stored on the instance; not used by this class.
            data: Stored on the instance; not used by this class.
            station: ``'all'`` keeps every sample; ``'summer'``, ``'winter'``
                or ``'equinox'`` keep the samples accepted by the matching
                predicate imported from ``station``.
            ever: Stored on the instance; not used by this class.
            smooth: Accepted for compatibility; not used by this class.
            scaler: Accepted for compatibility; not used by this class.

        Raises:
            ValueError: If ``station`` is not one of the supported values.
        """
        self.name = name
        self.path_files = path_files
        self.seq_length_min = seq_length_min
        self.step_min = step_min
        self.seq_length = int(seq_length_min / self.step_min)
        self.window_train = window_train
        self.window_predict = window_predict
        self.exovariables = exovariables
        self.data = data
        self.step = int(step_min / 10)
        self.ever = ever

        self.df = pd.read_pickle(f'df_{name}.pkl')
        # NOTE(security): pickle executes arbitrary code while loading; only
        # use sample files that come from a trusted source.
        with open(f'samples_{name}.pkl', 'rb') as handle:
            self.samples = pickle.load(handle)
        # NOTE(review): debugging truncation kept from the original script;
        # remove it to train on the full sample list.
        self.samples = self.samples[:5000]

        if station == 'all':
            pass
        elif station == 'summer':
            self.samples = [i for i in self.samples if is_summer(i)]
        elif station == 'winter':
            self.samples = [i for i in self.samples if is_winter(i)]
        elif station == 'equinox':
            self.samples = [i for i in self.samples if is_equinox(i)]
        else:
            raise ValueError(f'Invalid station => {station}')

        self.input_channel = 1
        self.shape = (1, 7, 7)
        self.seq_shape = (self.seq_length, self.input_channel,
                          self.shape[-2], self.shape[-1])

        # Do not open the HDF5 file here: a handle created in the parent
        # process must not be shared with DataLoader workers.  It is opened
        # on first use by ``_get_hdf`` instead.
        #self.hdf = h5py.File(f'file.h5', 'r')
        self._hdf = None
        self._hdf_pid = None

    def __getstate__(self):
        """Return the picklable state, without the open HDF5 handle."""
        state = self.__dict__.copy()
        # An open h5py file cannot be pickled (e.g. for "spawn" workers);
        # the receiving process opens its own handle on first access.
        state['_hdf'] = None
        state['_hdf_pid'] = None
        return state

    def _get_hdf(self):
        """Return this process's read-only HDF5 handle, opening it if needed."""
        pid = os.getpid()
        cached = getattr(self, '_hdf', None)
        # Re-open when the handle was inherited from another process (fork).
        if cached is None or getattr(self, '_hdf_pid', None) != pid:
            self._hdf = h5py.File(self.HDF_PATH, 'r')
            self._hdf_pid = pid
        return self._hdf

    def _random_index(self):
        """Draw a replacement sample index uniformly at random."""
        return np.random.choice(arange(0, len(self)))

    def get_index_time(self, index):
        """Return the ``(index_start, index_end)`` pair of sample ``index``."""
        index_start, index_end = self.samples[index]
        return index_start, index_end

    def load(self, index):
        """Build the ``(X, y)`` tensors for sample ``index``.

        If the sample cannot be built (a frame is missing from the HDF5 file,
        a lookup raises ``KeyError``/``IndexError``, or the assembled
        sequence contains NaN), another index is drawn at random and the
        attempt is repeated until a valid sequence is found.

        Returns:
            Tuple ``(X, y)`` of ``torch.Tensor``: the first
            ``self.window_train`` frames and the remaining frames, each with
            the first two axes swapped (channel axis first).
        """
        hdf = self._get_hdf()
        while True:
            try:
                index_start, index_end = self.samples[index]
                timestamps = self.df[index_start:index_end].index
                sample = zeros(self.seq_shape)
                for idx, stamp in enumerate(timestamps):
                    key = stamp.strftime('%Y-%m-%d %H:%M')
                    dataset = hdf.get(key)
                    if dataset is None:
                        # Missing frame: handled below by resampling.
                        raise KeyError(key)
                    frame = array(dataset)
                    if frame.ndim < 2:
                        # Frames are indexed as 2-D; anything smaller is
                        # treated as unusable, as the original indexing did.
                        raise IndexError(f'frame {key} is not 2-D')
                    frame[frame == -np.inf] = 0
                    # Remaining negatives are replaced by the frame's 10%
                    # quantile (computed after the -inf replacement).
                    frame[frame < 0] = np.quantile(frame, 0.10)
                    frame /= 250
                    sample[idx, 0, :, :] = frame
                # Accept the sequence only if no frame contributed NaN.
                if not np.any(np.isnan(sample)):
                    break
                index = self._random_index()
            except (KeyError, IndexError):
                index = self._random_index()

        X = sample[0:self.window_train, :, :, :]
        y = sample[self.window_train:, :, :, :]
        X = Tensor(X.transpose(1, 0, 2, 3))
        y = Tensor(y.transpose(1, 0, 2, 3))
        return X, y

    def __getitem__(self, index):
        """Return the ``(X, y)`` pair for ``index`` (see ``load``)."""
        return self.load(index)

    def __len__(self):
        """Return the number of available samples."""
        return len(self.samples)
The commented-out line `#self.hdf = h5py.File(f'file.h5', 'r')` was originally uncommented, but to solve the problem I mentioned in the answer https://discuss.pytorch.org/t/training-over-thin-system-crashes-using-pytorch-gppu-has-allocated-but-always-has-0-Utilization-using-dataloader/170801/5?u=mark_c — specifically by making the adjustments described in “Data Loader does not work with Hdf5 file, when num_worker >1 · Issue #11929 · pytorch/pytorch · GitHub” — I made the appropriate changes so that the file is opened only inside the __getitem__ function (via load). Can you help me, please? Thanks in advance.