Steady Increase of Cached Memory During Training

Hi, I am training my own model using the data pipeline provided by Background-Matting, and during training the cached memory (CPU memory) grows steadily and sometimes runs out near the end of training, so my process is often killed with a bus error. To isolate the problem, I wrote a dummy script that only iterates over the dataloader:

import torch

# AdobeDataAffineHR is defined below; the CSV lists the paths to fg, alpha, composite image and background
data_config_train = {'reso': [512, 512], 'trimapK': [5, 5], 'noise': True}
train_dataset = AdobeDataAffineHR("Data_adobe/Adobe_train_data_50.csv", data_config_train)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, num_workers=0, shuffle=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

for epoch in range(10):
    # no model and no backward pass: just fetch batches and move them to the GPU
    for i, data in enumerate(train_loader):
        fg, bg, alpha, image, bg_tr = data['fg'].to(device), data['bg'].to(device), data['alpha'].to(device), \
                                      data['image'].to(device), data['bg_tr'].to(device)
        if (i+1)%50 == 0:
            print("{} iterations finished!".format(i+1))
    print("{} epochs finished!".format(epoch+1))

Here is the AdobeDataAffineHR code:

import random

import cv2
import numpy as np
import pandas as pd
import skimage.exposure
from skimage import io
from torch.utils.data import Dataset

# generate_trimap, random_choice, safe_crop, add_noise and to_tensor are the
# helper functions from the Background-Matting project.


class AdobeDataAffineHR(Dataset):
    def __init__(self, csv_file, data_config, transform=None):
        frames = pd.read_csv(csv_file, sep=';')
        self.frames = np.array(frames)
        self.transform = transform
        self.resolution = data_config['reso']
        self.trimapK = data_config['trimapK']
        self.noise = data_config['noise']

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        # try:
        # load the four images listed in the CSV row: fg, alpha, composite image, background
        cv2.setNumThreads(0)
        cv2.ocl.setUseOpenCL(False)
        fg = io.imread(self.frames[idx, 0])
        alpha = io.imread(self.frames[idx, 1])
        image = io.imread(self.frames[idx, 2])
        back = io.imread(self.frames[idx, 3])

        fg = cv2.resize(fg, dsize=(800, 800))
        alpha = cv2.resize(alpha, dsize=(800, 800))
        back = cv2.resize(back, dsize=(800, 800))
        image = cv2.resize(image, dsize=(800, 800))

        sz = self.resolution

        # random flip
        if np.random.random_sample() > 0.5:
            alpha = cv2.flip(alpha, 1)
            fg = cv2.flip(fg, 1)
            back = cv2.flip(back, 1)
            image = cv2.flip(image, 1)

        trimap = generate_trimap(alpha, self.trimapK[0], self.trimapK[1], False)

        # random crop + scale
        different_sizes = [(576, 576), (608, 608), (640, 640), (672, 672), (704, 704), (736, 736), (768, 768),
                           (800, 800)]
        crop_size = random.choice(different_sizes)

        x, y = random_choice(trimap, crop_size)

        fg = safe_crop(fg, x, y, crop_size, sz)
        alpha = safe_crop(alpha, x, y, crop_size, sz)
        image = safe_crop(image, x, y, crop_size, sz)
        back = safe_crop(back, x, y, crop_size, sz)
        trimap = safe_crop(trimap, x, y, crop_size, sz)

        fg, alpha, image, back = fg.astype(np.uint8), alpha.astype(np.uint8), \
                                     image.astype(np.uint8), back.astype(np.uint8)

        # Perturb background: random noise addition or gamma change
        if self.noise:
            if np.random.random_sample() > 0.6:
                sigma = np.random.randint(low=2, high=6)
                mu = np.random.randint(low=0, high=14) - 7
                back_tr = add_noise(back, mu, sigma)
            else:
                back_tr = skimage.exposure.rescale_intensity(back, out_range=(0, 255))
                back_tr = skimage.exposure.adjust_gamma(back_tr, np.random.normal(1, 0.12))
                back_tr = back_tr.astype(np.uint8)
        else:
            back_tr = back  # keep the background unchanged so 'bg_tr' is always defined

        sample = {'image': to_tensor(image), 'fg': to_tensor(fg), 'alpha': to_tensor(alpha), 'bg': to_tensor(back),
                  'bg_tr': to_tensor(back_tr), 'trimap': to_tensor(trimap)}

        if self.transform:
            sample = self.transform(sample)

        return sample
        # except Exception as e:
        #     print("Error loading: " + self.frames[idx, 3])
        #     print(e)

The functions used above (generate_trimap, random_choice, safe_crop, add_noise, to_tensor) are the same as in the Background-Matting project. When I run the dummy script I see a fast increase of cached memory (about 1 GB per 50 mini-batches). I have also tried several libraries (PIL, skimage, cv2 and imageio) to read the images, but the behaviour is the same.
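For reference, this is roughly how I log memory while the dummy loop runs. It is only a debugging sketch: it uses psutil, which is not in my environment list below, and the log_memory helper is something I added myself, not part of Background-Matting.

import psutil

def log_memory(tag=""):
    # resident memory of this process vs. system-wide cached/available memory
    rss_gb = psutil.Process().memory_info().rss / 1024 ** 3
    vm = psutil.virtual_memory()
    cached_gb = getattr(vm, 'cached', 0) / 1024 ** 3  # the 'cached' field is reported on Linux
    print("{} RSS={:.2f} GB cached={:.2f} GB available={:.2f} GB".format(
        tag, rss_gb, cached_gb, vm.available / 1024 ** 3))

I call log_memory("iter {}".format(i + 1)) next to the existing print in the inner loop, so I can see which of these numbers is the one growing by roughly 1 GB every 50 mini-batches.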

I suspect the file cache is what consumes this memory, but what is weird is that the cached memory keeps increasing even after the first epoch, when in theory all the images should already be cached (the /proc/meminfo check after the environment list is how I try to verify this). I am training in a conda virtual environment on Ubuntu with 128 GB of RAM; the basic environment information is listed below:

python 3.6
pytorch 1.1.0
numpy==1.17.0
opencv-python==3.4.5.20
pandas
Pillow==6.1
scikit-image==0.14.2
scipy==1.2.1
tqdm
tensorboardX
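To check whether the growth is really the kernel page cache (which the OS should be able to reclaim) rather than memory held by the Python process itself, I also diff /proc/meminfo before and after one epoch. Again, this is only a Linux debugging sketch and not part of the training code:

def read_meminfo():
    # parse /proc/meminfo into a dict of kB values (Linux only)
    info = {}
    with open('/proc/meminfo') as f:
        for line in f:
            key, value = line.split(':')
            info[key] = int(value.strip().split()[0])  # values are reported in kB
    return info

before = read_meminfo()
# ... run one epoch of the dummy loop here ...
after = read_meminfo()
for key in ('MemFree', 'MemAvailable', 'Cached', 'Buffers'):
    print('{}: {:+.2f} GB'.format(key, (after[key] - before[key]) / 1024 ** 2))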