CPU running out of memory with DataLoader

When calling my DataLoader I am getting a memory error. Here is the DataLoader and the code that calls it.

import os
import imageio
import pandas as pd
import torch
from torch.utils.data import Dataset


class HRADataset(Dataset):
    """HRA dataset."""

    def __init__(self, csv_file, image_dir, transform=None, target_transforms=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            image_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
            target_transforms (callable, optional): Optional transform to be
                applied on a label.
        """
        self.labels = pd.read_csv(csv_file)
        self.image_dir = image_dir
        self.transform = transform
        self.target_transforms = target_transforms

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Build the image path from the first CSV column and load it
        img_name = os.path.join(self.image_dir, self.labels.iloc[idx, 0])
        image = imageio.imread(img_name)

        label = self.labels.iloc[idx, 1]
        if self.transform:
            image = self.transform(image)

        if self.target_transforms:
            label = self.target_transforms(label)

        sample = {'image': image, 'label': label}

        return sample

Code:

training_generator = DataLoader(train_data, batch_size=16,
                                shuffle=False, num_workers=0)

def train():
    start_time = time.time()
    my_nn.train()
    my_nn.apply(init_weights)

    global best_result

    best_result = 0
    total_loss = 0.
    log_interval = 1
    correct = 0.
    mode = 'loss'

    for epoch in range(max_epochs):
        for i, local_sample in enumerate(training_generator):
            batch_sample = local_sample['image'].to(device)
            label_sample = local_sample['label'].to(device)

            optimizer.zero_grad()
            pred = my_nn(batch_sample)

            _, predicted = torch.max(pred.data, 1)

            correct += (label_sample==predicted).sum().item()
            loss = criterion(pred, label_sample)
            
            loss.backward()
            
            nn.utils.clip_grad_norm_(my_nn.parameters(), 0.5)
            optimizer.step()
            total_loss += loss.item()
            
            if i % 50 == 0:
                print(i)

            if epoch % log_interval == 0 and i == n_batches - 1:
                cur_loss = total_loss / log_interval
                elapsed = time.time() - start_time
                print('Train: | epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                      'loss {:5.5f} | correct {:6.0f}'.format(
                    epoch, i+1, n_batches, elapsed * 1000 / n_batches,
                    math.sqrt(cur_loss), correct))

                start_time = time.time()
                total_loss = 0.
                correct = 0.

train()
Traceback:

Input In [11], in train()
     27 mode = 'loss'
     29 for epoch in range(max_epochs):
---> 30     for i, local_sample in enumerate(training_generator):
     31         batch_sample= local_sample['image'].to(device)
     32         label_sample = local_sample['label'].to(device)

....

RuntimeError: [enforce fail at C:\cb\pytorch_1000000000000\work\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 4320000 bytes.

How big is the CSV file?

Hey James, it is only 2 KB; the image folder is 300+ KB, but that should not matter.

It seems your system runs out of memory while trying to allocate ~4 MB. Note that 4,320,000 bytes happens to equal 16 × 300 × 300 × 3, which would match one batch of sixteen 300×300 RGB uint8 images, so the request itself is small; the failure suggests the host RAM is already nearly exhausted. Could you check your system resources to see whether your host RAM is almost at its limit before you launch the process?
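
For example, here is a minimal sketch for checking that, assuming the psutil package (not part of your code) is installed:

import psutil

def log_memory(tag=''):
    # Snapshot of host RAM so we can see how close the process is to the limit
    vm = psutil.virtual_memory()
    print('[{}] RAM used: {:.1f}% | available: {:.0f} MiB'.format(
        tag, vm.percent, vm.available / 1024**2))

log_memory('before training')
train()
log_memory('after training')

If available RAM is already near zero before train() starts, the ~4 MB batch allocation is just the request that tips it over, and freeing memory on the machine (or reducing batch_size) should make the error go away.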