DataLoader Images are all stored into the RAM

I am trying to train a CNN using images and instead of dividing the images into different folders for train and validation sets, I defined my own Dataset subclass as follows:

class TrainDataset(Dataset):
    def __init__(self, root, transform=None, labels_df=None, image_filenames = [], 
                 augmentation_transform=None, score_th = 7.0):
        
        assert labels_df is not None, "please provide the ground truth"
        assert len(image_filenames) > 0, 'you must provide the list of filenames'
        
        self.root = root
        self.transform = transform
        self.labels_df = labels_df
        self.image_filenames = image_filenames
        
        self.augmentation_transform  = augmentation_transform
        self.score_th = score_th

    def __len__(self):
        return len(self.image_filenames)

    def __getitem__(self, index):
        # load the image
        image_path = os.path.join(self.root, self.image_filenames[index])
        image_num = int(self.image_filenames[index].split(".")[0])

        image = Image.open(image_path).convert('RGB')
        # apply the transform
        if self.transform:
            # Transformations on all the images
            image = self.transform(image)

        # load the score for the image from the DataFrame
        row = self.labels_df[self.labels_df['image_num'] == image_num]
        labels = torch.tensor(row.iloc[:,1:].values, dtype=torch.float32)  
        labels = labels.permute(1,0).to(device)

        score = compute_score(labels)

        if score >= self.score_th and self.augmentation_transform:
          image = self.augmentation_transform(image)


        return image.to(device), labels
class ValDataset(Dataset):
    def __init__(self, root, transform=None, labels_df=None, images_filenames = []):
        
        assert labels_df is not None, "please provide the ground truth"
        assert len(images_filenames) > 0, 'you must provide the list of filenames'
        
        self.root = root
        self.transform = transform
        self.labels_df = labels_df
        self.image_filenames = images_filenames

    def __len__(self):
        return len(self.image_filenames)

    def __getitem__(self, index):
        # load the image
        image_path = os.path.join(self.root, self.image_filenames[index])
        image_num = int(self.image_filenames[index].split(".")[0])

        image = Image.open(image_path).convert('RGB')
        # apply the transform
        if self.transform:
            image = self.transform(image)
        
        row = self.labels_df[self.labels_df['image_num'] == image_num]
        labels = torch.tensor(row.iloc[:,1:].values, dtype=torch.float32)  
        labels = labels.permute(1,0).to(device)

        
        return image.to(device), labels

Then when creating the DataLoaders object the code is the following:

val_transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
val_data = ValDataset(images_dir, transform=val_transform, labels_df = labels_df, images_filenames = val_filenames)
val_dl = DataLoader(val_data, batch_size=64, shuffle=False)

# Define the transforms to be applied to the image
train_transform = transforms.Compose([
    transforms.Resize((256,256)),
    transforms.RandomCrop((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

augmentation_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(0.7)
])

# create the custom dataset using the images folder and the labels DataFrame
train_data = TrainDataset(images_dir, transform=train_transform, 
                          labels_df= labels_df, image_filenames = train_filenames, 
                          augmentation_transform = augmentation_transform)

train_dl = DataLoader(train_data, batch_size=64, shuffle=True)

The train loop is the following one:

def train(model, optimizer, epochs: int, train_dl, valid_dl = None,  beta = 3.33, gamma = 2.5,
          save_path = None, patience = 15, scheduler = None, checkpoint = None, checkpoint_sp = None): 
    
    assert save_path is not None and os.path.exists(os.path.dirname(save_path)), "In order to save weights you must provide a valid path"

    
    if checkpoint is None:
        best_valid_loss = 0
        loss_set = False
        best_params = None
        no_improv_epochs = 0
        starting_epoch = 0
    else:
        best_valid_loss = checkpoint['bvl']
        starting_epoch = checkpoint['epoch'] + 1 # epoch contiene l'ultima epoca di train effettuata
        best_params = checkpoint['best_params']
        loss_set = True
        no_improv_epochs = checkpoint['no_imp_epoch']
        best_epoch = checkpoint['best_epoch']
        
    for epoch in range(starting_epoch,epochs):
        model.train()
        train_loss = 0
        train_samples = 0

        bn = len(train_dl)
        loop = tqdm(train_dl)
        loop.set_description(f'Epoch: {epoch+1} / {epochs}')

        for data in loop:
            inputs, targets = data
            optimizer.zero_grad()
            # Forward pass
            outputs = model.forward(inputs)
            outputs = outputs.unsqueeze(dim=-1)
            outputs = outputs.view(-1, 10, 1)
            
            loss = batch_focal_r_loss(outputs, targets, beta = beta, gamma = gamma)
            train_samples += data[0].shape[0]
            train_loss += loss * data[0].shape[0]

            # Backward pass and optimize
            loss.backward()
            
            optimizer.step()
            optimizer.zero_grad()
        
        # validation
        train_loss /= train_samples
        
        model.eval()
        with torch.no_grad():
            valid_loss = 0
            valid_samples = 0
            
            if valid_dl is not None:
                for valid_data in valid_dl:
                    val_inputs, val_targets = valid_data
                    valid_scores = model.forward(val_inputs)
                    valid_scores = valid_scores.unsqueeze(dim=-1)
                    valid_scores = valid_scores.view(-1, 10, 1)
                    
                    vl = batch_focal_r_loss(valid_scores, val_targets, beta = beta, gamma = gamma)
                    valid_loss +=  ( vl * val_inputs.shape[0] )        
                    valid_samples += val_inputs.shape[0]
                
                valid_loss /= valid_samples
            
            if scheduler:
                scheduler.step(valid_loss)
            
            # Print the information of the training epoch
            epochInfo = f" train loss {train_loss.item():.3f}"
            if valid_dl:
                epochInfo += f" validation loss: {valid_loss.item():.3f} "
            
            print(epochInfo)
            
            if valid_dl is None:
                best_params = model.state_dict()
                best_epoch = epoch
            
            elif valid_loss < best_valid_loss or not loss_set:
                best_valid_loss = valid_loss 
                loss_set = True
                best_params = model.state_dict()
                best_epoch = epoch + 1
                no_improv_epochs = 0
            else: 
                no_improv_epochs += 1
                # Early stopping
                if no_improv_epochs == patience:
                    print(f"There have been no improvements in the last {patience} epochs, so the training will stop.")
                    break

        
        torch.save({
            'epoch': epoch,
            'best_params': best_params,
            'best_epoch': best_epoch,
            'bvl': best_valid_loss,
            'no_imp_epoch' : no_improv_epochs,
            
            'model': model.state_dict(),
            'optim': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()
        }, checkpoint_sp)
    
    torch.save(best_params, save_path)
    print("Weights correctly saved in {}".format(save_path))
    if valid_dl:
        print(f"The best epoch is epoch {best_epoch} with a validation_loss of: {best_valid_loss.item()} ")
    
    return best_params
   

And the code for the loss is the following:

def emd_loss(p, t, r=2):

  assert len(p) == len(t), "The distributions must have the same length"

  loss = 0.0
  cdf_p = 0.0
  cdf_t = 0.0

  for elem_p, elem_t in zip(p,t):
    cdf_p += elem_p
    cdf_t += elem_t
    loss += torch.abs(cdf_p - cdf_t) ** r

  loss = loss ** (1. /r)

  return  loss

def focal_r_loss(p,t,beta, gamma):
  e = emd_loss(p,t)

  res = beta * e
  res = torch.sigmoid(res)
  res = res ** gamma

  res = res * e

  return res


def batch_focal_r_loss(p,t, beta = 3.33, gamma = 2, reduction = 'mean'):

  assert p.shape == t.shape, "The shapes must be the same, got {} and {}".format(p.shape, t.shape)

  assert reduction in ['mean', 'none'], "The reduction modality must be one of : [ 'mean', 'none' ], got {} ".format(reduction)

  batch_dim = p.shape[0]
  losses = []

  for i in range(batch_dim):
    losses.append(focal_r_loss(p[i], t[i], beta, gamma))


  if reduction == 'mean':
    tmp = sum(losses) / batch_dim
    losses = tmp

  # If reduction is None return the list as it is

  return losses

When training the model with the whole dataset (it is made of more than 200k images) I notice two weird behaviors:

  • The RAM usage keeps increasing until the whole RAM is full and the training stops
  • The GPU usage is below 50%

Can you help me finding a way to fix the first problem? (The ram one)… I’ll also add the code for the call of the training method too

# Train the model
network = mobilenet_v3_large(weights = "IMAGENET1K_V2")
model = NIMA(network).to(device)
optimizer = optim.Adam(model.parameters(), lr=5e-4 )
scheduler =  torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)

save_path = os.path.join(os.getcwd(), "Nima_wholeDataset.pt")
checkpoint_sp = os.path.join(os.getcwd(), "checkpoint.pt")
num_epochs = 50

if os.path.isfile(checkpoint_sp):
    checkpoint = torch.load(checkpoint_sp)
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optim'])
    scheduler.load_state_dict(checkpoint['scheduler'])
else:
    checkpoint = None


best_weights = train(model ,optimizer, num_epochs ,train_dl, val_dl, save_path=save_path, 
                     scheduler = scheduler, checkpoint = checkpoint, checkpoint_sp = checkpoint_sp )

You are storing all computation graph including all intermediate tensors by accumulating the loss:

train_loss += loss * data[0].shape[0]

Either .detach() the loss tensor or call .item() on it before accumulating it into train_loss.