Custom Dataset / PyTorch DataLoader requires a lot of RAM

Hello guys, I need help
I created a custom Dataset in PyTorch whose __getitem__ loads the images and builds a whole batch per call, and when I run the training for loop the RAM usage gradually increases.
The images are 640x640 and the masks are 320x320, and it takes only about 300 images to fill up the RAM.
It has nothing to do with the prefetching data loader, because I tested without it and saw the same growth.
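For reference, a quick way to watch the growth per iteration is to log the resident set size of the process; this is just a sketch and assumes psutil is installed:

import os
import psutil

process = psutil.Process(os.getpid())

def log_rss(step):
    # Resident set size of this Python process, in MiB
    print(f"step {step}: RSS = {process.memory_info().rss / (1024 ** 2):.1f} MiB")

My Dataset and training loop are below: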

import torch
from torch.utils.data import Dataset, DataLoader
from prefetch_generator import BackgroundGenerator

class CustomDataset(Dataset):
    def __init__(self, data_path, label_path, image_size=(720, 1280), normalize=True, class_mapping=None, batch_size=8):
        # CustomDataLoader (defined elsewhere) exposes num_samples, annotations and process_batch()
        self.data_loader = CustomDataLoader(data_path, label_path, image_size, normalize, class_mapping, batch_size)
        print("one")  # debug print
        self.batch_size = batch_size
        self.image_size = image_size

    def __len__(self):
        return self.data_loader.num_samples // self.batch_size

    def __getitem__(self, index):
        start_idx = index * self.batch_size
        end_idx = min((index + 1) * self.batch_size, self.data_loader.num_samples)
        batch_annotations = self.data_loader.annotations[start_idx:end_idx]

        batch_images, batch_masks = self.data_loader.process_batch(batch_annotations)

        # Convert lists to numpy arrays and stack the masks
        # batch_images = np.array(batch_images)
        # batch_masks = np.stack(batch_masks, axis=0)

        # Transpose batch_images to shape [batch_size, C, H, W]
        batch_images = torch.tensor(batch_images.transpose(0, 3, 1, 2), dtype=torch.float32)

        # Remove the extra dimension at index 1 from batch_masks
        batch_masks = torch.tensor(batch_masks[:, 0, :, :], dtype=torch.float32)

        # Build a two-channel mask: (mask, 1 - mask)
        batch_masks = torch.stack((batch_masks, 1 - batch_masks), dim=1)

        return batch_images, batch_masks

class DataLoaderX(DataLoader):
    """DataLoader whose iterator is wrapped in BackgroundGenerator so batches are prefetched in a background thread"""
    def __iter__(self):
        return BackgroundGenerator(super().__iter__())

# data_path, label_path and class_mapping are defined earlier (omitted here)
dataset = CustomDataset(data_path, label_path, image_size=(640, 640), normalize=True, class_mapping=class_mapping)
batch_size = 1  # The Dataset already returns whole batches, so the DataLoader batch size stays at 1
train_data_loader = DataLoaderX(dataset, batch_size=batch_size, shuffle=False, pin_memory=False, num_workers=0)
# Example of how to iterate through the data loader during training
print(len(train_data_loader))
for i, (batch_images, masks) in enumerate(train_data_loader):
    # Use batch_images and masks for the training step here.
    # Note: the Dataset already returns whole batches, so with DataLoader batch_size=1
    # the tensors carry an extra leading dimension of size 1.
    try:
        print(batch_images[:, 0, :, :].shape)
        print(masks[:, 0, :, :].shape)
        # print(lane_masks.shape)
        print(i * batch_size)  # Starting index of each batch

    except:
        # Bare except: batches that raise are silently skipped
        continue

Could you explain what the custom DataLoader inside the Dataset does?
Do you still see the memory increase after removing it?
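
Usually __getitem__ returns a single sample and the DataLoader handles the batching (and can parallelize loading with num_workers), so nothing batch-sized has to live inside the Dataset. Here is a minimal sketch of that pattern; it assumes cv2 for loading and that each annotation entry carries an image path and a mask path (placeholder names, adapt to your CustomDataLoader):

import cv2
import torch
from torch.utils.data import Dataset, DataLoader

class PerSampleDataset(Dataset):
    def __init__(self, annotations, image_size=(640, 640), mask_size=(320, 320)):
        # annotations: list of (image_path, mask_path) pairs -- placeholder structure
        self.annotations = annotations
        self.image_size = image_size
        self.mask_size = mask_size

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        image_path, mask_path = self.annotations[index]

        # Load and resize one image/mask pair only; nothing is cached on self
        image = cv2.resize(cv2.imread(image_path), self.image_size)
        mask = cv2.resize(cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE), self.mask_size)

        image = torch.from_numpy(image.transpose(2, 0, 1)).float() / 255.0  # [C, H, W]
        mask = torch.from_numpy(mask).float() / 255.0                       # [H, W]
        mask = torch.stack((mask, 1 - mask), dim=0)                         # [2, H, W]
        return image, mask

# The DataLoader builds the batches, so batch_size here is the real batch size
# loader = DataLoader(PerSampleDataset(annotations), batch_size=8, shuffle=True, num_workers=2)

If memory still grows with something like this, the leak is more likely in whatever CustomDataLoader keeps on self (e.g. caching decoded images alongside self.annotations) or in the training loop itself (e.g. accumulating losses or tensors in a Python list without .item() / .detach()).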