Training "never finishes" or system crashes using PyTorch - GPU has memory allocated but always has 0% utilization using DataLoader

@ptrblck I know your time is scarce to keep answering me here, but I believe the problem may be that in the environment where I was able to run the code without problems the data was loaded from an SSD, while in this new environment it is probably an HDD, which should drastically affect the execution speed. I should also mention that in a test I did today I managed to complete the training using only the CPU, with 1 epoch and a greatly reduced number of samples, but even so it took a long time to finish (about 3 hours). Below is the script (“data_loader.py”) that handles the DataLoader side and that I am using in my latest tests (for the test with far fewer samples I added the line “self.samples = self.samples[:5000]”).

import os
import pdb
import pickle
from os.path import basename, isfile, join

import h5py
import numpy as np
import pandas as pd
import pytz
import torch.utils.data as data
from numpy import arange, array, zeros
from scipy import signal
from torch import Tensor

from station import is_equinox, is_summer, is_winter
from scipy.signal import savgol_filter

TIMEZONE = pytz.timezone("UTC")

class SequenceLoader(data.Dataset):
    """Main Class for Image Folder loader."""
    def __init__(self,
                 name,
                 path_files,
                 seq_length_min,
                 step_min,
                 window_train,
                 window_predict,
                 exovariables=False,
                 data=None,
                 station='all',
                 ever=False,
                 smooth=False,
                 scaler=False):
        """Init function."""
                                                            
        self.name = name
        self.path_files = path_files
        self.seq_length_min = seq_length_min
        self.step_min = step_min
        self.seq_length = int(seq_length_min / self.step_min)
        self.window_train = window_train
        self.window_predict = window_predict
        self.exovariables = exovariables
        self.data = data
        self.step = int(step_min / 10)
        self.ever = ever

        # try load samples from file, if not generate samples from database
        self.df = pd.read_pickle(f'df_{name}.pkl')
        self.samples = pickle.load(open(f'samples_{name}.pkl', 'rb'))
        self.samples = self.samples[:5000]  # temporary: reduce the sample count for quick tests
        
        if station == 'all':
            pass
        elif station == 'summer':
            self.samples = [i for i in self.samples if is_summer(i)]
        elif station == 'winter':
            self.samples = [i for i in self.samples if is_winter(i)]
        elif station == 'equinox':
            self.samples = [i for i in self.samples if is_equinox(i)]
        else:
            raise Exception(f'Invalid station => {station}')

        self.input_channel = 1
        self.shape = (1, 7, 7)
        self.seq_shape = (self.seq_length, self.input_channel, self.shape[-2],
                          self.shape[-1])

        #self.hdf = h5py.File(f'file.h5', 'r')

    def get_index_time(self, index):
        index_start, index_end = self.samples[index]
        return index_start, index_end

    def load(self, index):
        while True:
            try:
                index_start, index_end = self.samples[index]
                df_aux = self.df[index_start:index_end]

                samples_index = df_aux.index

                sample = zeros(self.seq_shape)
                aux_array = None

                for idx, aux in enumerate(samples_index):
                    key = aux.strftime('%Y-%m-%d %H:%M')
                    # NOTE: the HDF5 file is reopened here for every timestamp of every
                    # sample, which adds filesystem overhead on slow storage
                    aux_array = array(h5py.File('file.h5', 'r').get(key))
                    aux_array[:, :][aux_array[:, :] == -np.inf] = 0
                    aux_array[:, :][aux_array[:, :] < 0] = np.quantile(aux_array[:, :], 0.10)

                    aux_array[:, :] /= 250
                    sample[idx, 0, :, :] = aux_array

                # draw a new random index if the loaded sequence contains NaNs
                if np.any(np.isnan(sample)):
                    index = np.random.choice(arange(0, self.__len__()))
                else:
                    break

            except KeyError as e:
                index = np.random.choice(np.arange(0, self.__len__()))
            except IndexError as e:
                index = np.random.choice(arange(0, self.__len__()))

        X = sample[0:self.window_train, :, :, :]
        y = sample[self.window_train:, :, :, :]

        X = Tensor(X.transpose(1, 0, 2, 3))
        #print("X.device: "+str(X.device))
        #print("\n\n")
        y = Tensor(y.transpose(1, 0, 2, 3))
        #print("y.device: "+str(y.device))
        #print("\n\n")

        return X, y

    def __getitem__(self, index):
        """Get item."""
        return self.load(index)

    def __len__(self):
        """Length."""
        return len(self.samples)

The commented-out line #self.hdf = h5py.File(f'file.h5', 'r') was originally uncommented, but to solve the problem I mentioned in this reply - https://discuss.pytorch.org/t/training-never-finishes-or-system-crashes-using-pytorch-gpu-has-memory-allocated-but-always-has-0-utilization-using-dataloader/170801/5?u=marco_c - and following the adjustments described in “Data Loader does not work with HDF5 file, when num_workers > 1 · Issue #11929 · pytorch/pytorch · GitHub”, I made the appropriate changes and now open the file only from __getitem__ (via load()). Can you help me please? Thanks in advance.
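For reference, the lazy-open pattern from that GitHub issue looks roughly like this (just a minimal sketch, not my actual loader; the point is that each worker opens its own handle once on first access instead of the file being opened in __init__ or once per sample):

import h5py
import numpy as np
import torch.utils.data as data

class LazyH5Dataset(data.Dataset):
    """Sketch: each DataLoader worker opens its own HDF5 handle lazily."""
    def __init__(self, h5_path, keys):
        self.h5_path = h5_path
        self.keys = keys      # e.g. list of timestamp strings
        self.hdf = None       # no handle is created in __init__

    def __getitem__(self, index):
        if self.hdf is None:
            # first access inside the worker process: open the file here,
            # so no h5py handle is shared across processes
            self.hdf = h5py.File(self.h5_path, 'r')
        return np.array(self.hdf[self.keys[index]])

    def __len__(self):
        return len(self.keys)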

Your concern about the slow data loading speed on an HDD is valid. You could check how long loading and processing a single batch takes in the current setup by removing the actual model training and just iterating the DataLoader for e.g. 10 batches.
This would give you an idea of what performance to expect, and maybe you are not running into a hang but just terribly slow data loading.
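Something along these lines would already be enough (a rough sketch; ds_train stands for your SequenceLoader instance and the batch size is just an example):

import time
import torch

loader = torch.utils.data.DataLoader(
    ds_train, batch_size=25, shuffle=True, num_workers=8, pin_memory=True)

it = iter(loader)
start = time.time()
for i in range(10):
    X, y = next(it)  # only loads and processes the data, no model involved
print("10 batches took {:.2f} seconds".format(time.time() - start))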

@ptrblck Hi, I ran the tests. I iterated the training DataLoader for 10 batches, each of size 25, using 8 workers in both the non-problematic and the problematic environment. The code snippet below ran in 4.6452929973602295 seconds in the non-problematic environment and in 260.850994348526 seconds in the problematic one, with shuffle and pin_memory set to True in both. After several tests in the problematic environment I got a somewhat better result with 12 workers (higher worker counts made things worse); in that case the code executed in 152.2450408935547 seconds. So in the worst case the problematic environment was about 56 times slower than the problem-free one, and in the best case almost 33 times slower.

Of course I could compare the processors of the two machines, etc., but I think that kind of comparison would be beside the point; what I can say is that several CPU cores are used in both environments during this execution. I think we can safely conclude that this is the problem. Something interesting I also noticed is that most of the execution time of the complete training code is spent exactly on this data loading, which, from what I understand, happens just before the first training epoch starts.

What do you recommend I do now to better understand the reason for this slowness and try to solve the problem?

And to give a little more detail about the problematic environment with the HDD: in short, it is an HPC cluster that allocates the job to a certain machine/node for execution, but the program it runs is located on another machine/node that has an HDD. So the execution goes over the network, and the I/O happens where the program is located, on an HDD.

Simplified code:

import time
import uuid

import torch
import hydra

# force CPU execution for this test
torch.cuda.is_available = lambda: False


@hydra.main(config_path="conf", config_name="config")
def main(cfg):

    if cfg.execution.experiment_id:
        experiment_id = cfg.execution.experiment_id
    else:
        experiment_id = str(uuid.uuid4())
        cfg.execution.experiment_id = experiment_id

    experiment_name = cfg.register.experiment_name

    from data_loader import SequenceLoader

    ds_train = SequenceLoader(name='train', **cfg.dataloader)

    batch_size = cfg.hparams.batch_size

    seq_train = torch.utils.data.DataLoader(
        ds_train,
        batch_size=batch_size,
        shuffle=True,
        num_workers=8,
        pin_memory=True)

    ds_train.load(0)

    train_it = iter(seq_train)

    start_time = time.time()

    for i in range(10):
        my_seq_train_it = next(train_it)
        print("for index value " + str(i) + "\n\n")
        print("my_seq_train_it length " + str(len(my_seq_train_it)) + "\n\n")
        print("my_seq_train_it[0] length " + str(len(my_seq_train_it[0])) + "\n\n")
        print("my_seq_train_it[0][0] length " + str(len(my_seq_train_it[0][0])) + "\n\n")

    print("--- %s seconds ---" % (time.time() - start_time) + "\n\n")


if __name__ == "__main__":
    main()

Thank you very much again in advance!

Thanks for the verification!
Using a slow HDD as well as a network mount does indeed sound like the bottleneck of your training pipeline, and removing it would give you the largest benefit.
If possible, I would recommend copying the data to a local SSD on your training node before starting the training to speed up each iteration. Otherwise your entire training will constantly run into this bottleneck, which might even make training on this machine infeasible.
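E.g. something like this at the start of the job could stage the dataset onto node-local storage (just a sketch; the source and scratch paths as well as the file names are placeholders for your setup):

import shutil
from pathlib import Path

# placeholder paths: slow network/HDD location and fast node-local scratch
src_dir = Path("/network/share/dataset")
local_dir = Path("/scratch/my_job")
local_dir.mkdir(parents=True, exist_ok=True)

for fname in ["file.h5", "df_train.pkl", "samples_train.pkl"]:
    dst = local_dir / fname
    if not dst.exists():
        shutil.copy2(src_dir / fname, dst)  # copy once per node

# afterwards point SequenceLoader / the DataLoader to local_dir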

Hi, @ptrblck. News in a nutshell: I tested running directly on the training node, which I found actually has an SSD (putting the data and the program code there), but the performance was still terrible. I was quite surprised, since everything indicated that the problem was the data access going through the HDD over the network, and I believed the disk was the main aggravating factor. Investigating further, I found via the terminal that at least two nodes use a Micron_M510_MTFD SSD of 119.2 GB. I searched for a benchmark and found this one - UserBenchmark: Crucial MTFDBAK128MAG-1G1 128GB (if it really is that exact model, I believe it is normal that the Linux terminal reports a little less than the 128 GB indicated in the benchmark; even if the model is slightly different, it should not change much). Summary: that SSD is very bad (correct me if I’m wrong), especially for sequential reads, which should be the access pattern of the DataLoader in my case. I am now investigating which SSD model is used in the environment where the code runs quickly; I have already emailed that environment’s support and am waiting for a reply. So far they have told me that the storage is provided by a pool of SSDs, etc., but knowing the most commonly used models would be enough. That way I could compare the SSDs of the non-problematic environment with the SSD of the problematic one, which would probably confirm that even though it is an SSD, it is so poor that it basically behaves like an HDD. Do you agree with these assumptions? And assuming this is confirmed, I believe the solution would have to come from optimizing the code that uses the DataLoader, right? As it stands, do you see room for optimization? Could it be improved enough to take a real performance leap and eliminate this bottleneck? Thank you very much!

That’s an interesting description and I would definitely try to verify this claim by running a few benchmarks on this system, e.g. using hdparm to profile the actual SSD.
This would give you a good baseline to check whether the SSD is indeed the bottleneck.
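E.g. hdparm -Tt /dev/sdX (needs root) would give you the cached and buffered read speeds. If you cannot run it on the cluster, a rough sequential-read check directly from Python against the dataset file could also work (a sketch; note that the OS page cache can skew the result, so use a file larger than RAM or one that has not been read recently):

import time

def read_throughput(path, chunk_size=64 * 1024 * 1024):
    """Return the sequential read speed of a file in MB/s."""
    total = 0
    start = time.time()
    with open(path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            total += len(chunk)
    return total / 1e6 / (time.time() - start)

print("{:.1f} MB/s".format(read_throughput("file.h5")))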

Hi, @ptrblck. I solved the slow data loading problem. Apparently thanks to using a newer version of PyTorch or h5py (I haven’t confirmed which yet), the error I described here - Training "never finishes" or system crashes using PyTorch - GPU has memory allocated but always has 0% utilization using DataLoader - #21 by marco_c - regarding opening the file in the __init__ function of data_loader.py no longer happens. Loading 10 batches now takes about 9 seconds in my tests, although that is still a bit slower than the environment that runs without any problems, which takes about 4 seconds.

But now I’m back to the problem I’ve had since the beginning, which is the 0% GPU utilization. When the code is set up to use the GPU (which is not working in this environment), for some reason I can’t even use more than one CPU core; only when I adapt it to run on the CPU. I just finished a test and managed to run the training successfully using only the CPU, but I need to be able to use the GPU (remembering that it does load a certain amount of data into its memory, yet shows 0% utilization).

Can you help me please?

I would recommend testing the training on the GPU only by removing the data loading as described previously:

Using static inputs to train the model could get rid of the data loading bottleneck (in case it’s still the slowest part of the code) and should show some GPU utilization in nvidia-smi during the forward and backward passes.
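A sketch of what such a test could look like (the model and the input shapes here are placeholders, not your actual network):

import torch
import torch.nn as nn

device = "cuda"

# toy stand-in model; replace it with your actual network
model = nn.Sequential(nn.Conv2d(1, 16, 3, padding=1),
                      nn.ReLU(),
                      nn.Conv2d(16, 1, 3, padding=1)).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# static random inputs/targets created once, so no data loading is involved
x = torch.randn(25, 1, 7, 7, device=device)
y = torch.randn(25, 1, 7, 7, device=device)

for step in range(1000):
    optimizer.zero_grad()
    loss = criterion(model(x), y)
    loss.backward()
    optimizer.step()
# watch nvidia-smi in another terminal while this loop runs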

@ptrblck First, thanks for the quick answer! OK, now I remember you suggested this test to me before, sorry. But would this test basically use fake X and y data, just to do some lighter testing and exercise the GPU?

Yes, exactly. It would be an easy test to verify your GPU is working by removing the other parts of the code (unrelated to the GPU usage).
Alternatively, you could of course profile the entire training via e.g. the native PyTorch profiler or Nsight Systems and debug it.
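A minimal usage of the native profiler would look something like this (a sketch with a placeholder model; you would wrap a few of your real training iterations instead):

import torch
import torch.nn as nn
from torch.profiler import profile, ProfilerActivity

model = nn.Linear(49, 49).cuda()          # placeholder model
x = torch.randn(25, 49, device="cuda")    # placeholder batch
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    for _ in range(5):
        optimizer.zero_grad()
        loss = criterion(model(x), x)
        loss.backward()
        optimizer.step()

# shows where time is spent on the CPU vs. the GPU
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))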

@ptrblck, I couldn’t run the tests with fake data, because that would take some work since I would need to reproduce the format of my real data. But I am testing with the GPU: when the data starts loading, I can now get more than one CPU core working (the number of active cores matches the chosen number of workers), and the slowness comes exactly from the data loading part. Apparently using between 4 and 6 workers gives me the best results, but it is still quite slow. And I do see the volatile GPU utilization go well above 0% at times. On CPU only, I had achieved the best results using 12 workers. What can I do to improve this performance, please?

So far everything you’ve described points to a data loading bottleneck (including the GPU util. peaks when data is available). For general advice on fixing data loading bottlenecks, take a look at this post.


@ptrblck, perfect, thank you. I already took a look at the link you sent, but there is a lot of information and it left me confused. Now that I’ve run several tests, everything indicates that the problem is that, for some reason, even with num_workers > 0 I can’t get more than one CPU core working, and this only happens when my code uses the GPU; when I adapt the code to use only the CPU, I can use more than one CPU core (I commented on this before, but things were foggy then; now they are clearer and other possible causes have been ruled out). Using a V100 GPU I get a less bad performance, but only with num_workers = 0; I can’t improve the data loading because I can’t use more than one CPU core. Do you have any idea what I can do to resolve this? Thank you so much again.

@ptrblck Should I use a CUDA version as compatible as possible with the V100 to get maximum performance? I’m using version 11.6. After doing some research this seems to be a possibility.

I don’t understand what this would mean as all CUDA runtimes we are supporting are compatible with the V100.

Could you explain how you’ve confirmed this, please?

Hi, @ptrblck!

OK, thanks, I just wanted to confirm: since the V100 is fairly old, it could have been the case that I needed an older CUDA version. But I found an NVIDIA page (CUDA Compatibility :: NVIDIA Data Center GPU Driver Documentation) - see the last table - which shows, as I understand it, that current CUDA toolkit versions still support Volta (according to the “CTK Support” column, right?). I just didn’t fully understand the “Compute Capability” column, which indicates that Volta models correspond to compute capability 7.x (correct me if I’m wrong, please; I probably got something wrong).
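By the way, I understand these values can also be checked directly from PyTorch with something like this (just a quick check, unrelated to the DataLoader issue):

import torch

print(torch.version.cuda)                   # CUDA runtime PyTorch was built with, e.g. '11.6'
print(torch.cuda.get_device_name(0))        # e.g. 'Tesla V100-SXM2-16GB'
print(torch.cuda.get_device_capability(0))  # Volta should report (7, 0)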

As for the CPU cores: I confirmed it by checking CPU core utilization via htop.

I ran a simple script with a recent source build as well as the nightly binary:

import torch
from torch.utils.data import TensorDataset, DataLoader


dataset = TensorDataset(torch.randn(10000, 1))
loader = DataLoader(dataset, num_workers=4)

for epoch in range(100):
    for data in loader:
        pass
    print("epoch {} done".format(epoch))

and see that multiple CPU cores are used if num_workers>0.
Since you are apparently unable to use multiple cores in Python, I would recommend trying to narrow down this issue first.
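One thing worth checking while narrowing it down is how many CPU cores the process is actually allowed to use in that HPC environment, since job schedulers sometimes restrict the CPU affinity of a job (a small sketch, Linux-only):

import os
import torch

print(os.cpu_count())                # cores visible on the machine
print(len(os.sched_getaffinity(0)))  # cores this process may actually use
print(torch.get_num_threads())       # intra-op threads used by PyTorch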

OK, but that’s exactly what I’m trying to figure out: why this is happening and how I can fix it. As I said, in another environment I can run the same code and use more than one CPU core, in addition to using the GPU, with num_workers > 0. How could I check what is happening? I’m in the dark.

Well, maybe I misinterpreted your message, I’m sorry; I think you wanted me to run your simple code to verify whether I still can’t use more than one CPU core with num_workers > 0. I wasn’t able to adapt your code so that the data is transferred to CUDA, so I used another relatively simple script to test, and I still can’t use more than one CPU core. Code used:

    import torch
    import torch.nn as nn
    import torch.optim as optim
    import torchvision
    import torchvision.transforms as transforms
    import torch.nn.functional as F


    dataset = torchvision.datasets.CIFAR10(root='.',
                                           download=True,
                                           transform=transforms.ToTensor())


    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=25,
                                              shuffle=True,
                                              num_workers=4)


    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(3, 6, 5)
            self.pool = nn.MaxPool2d(2, 2)
            self.conv2 = nn.Conv2d(6, 16, 5)
            self.fc1 = nn.Linear(16 * 5 * 5, 120)
            self.fc2 = nn.Linear(120, 84)
            self.fc3 = nn.Linear(84, 10)

        def forward(self, x):
            x = self.pool(F.relu(self.conv1(x)))
            x = self.pool(F.relu(self.conv2(x)))
            x = x.view(-1, 16 * 5 * 5)
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = self.fc3(x)
            return x

    model = Net()
    model = model.cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    for epoch in range(100):
        running_loss = 0.0
        for i, data in enumerate(data_loader, 0):
            inputs, labels = data
            inputs, labels = inputs.cuda(), labels.cuda()

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        print(epoch, running_loss / len(data_loader))

I still cannot reproduce the issue using your code and can use multiple cores depending on the specified num_workers. I have never heard of the issue you are seeing before and thus don’t know what could prevent multiple processes from using different CPU cores.