Hi, I am training my own model using the data pipeline provided by Background-Matting. During training, the cache memory (CPU memory) increases steadily and sometimes runs out toward the end, so my process is often killed with a bus error. I wrote a dummy script to test the dataloader:
# Dummy loop: iterate the loader and move every batch to the device, nothing else.
data_config_train = {
    'reso': [512, 512],
    'trimapK': [5, 5],
    'noise': True,
}
train_dataset = AdobeDataAffineHR("Data_adobe/Adobe_train_data_50.csv", data_config_train)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=8,
    num_workers=0,
    shuffle=True,
)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

for epoch in range(10):
    # Count mini-batches from 1 so the progress message can use the counter directly.
    for step, data in enumerate(train_loader, start=1):
        fg = data['fg'].to(device)
        bg = data['bg'].to(device)
        alpha = data['alpha'].to(device)
        image = data['image'].to(device)
        bg_tr = data['bg_tr'].to(device)
        if step % 50 == 0:
            print("{} iterations finished!".format(step))
    print("{} epochs finished!".format(epoch + 1))
And the AdobeDataAffineHR code is here:
class AdobeDataAffineHR(Dataset):
    """Dataset that loads and augments matting samples listed in a CSV file.

    Each row of the ``;``-separated CSV supplies four image paths, read in
    ``__getitem__`` from columns 0-3 as foreground, alpha, composite image
    and background.
    """

    def __init__(self, csv_file, data_config, transform=None):
        """Read the sample list and store the augmentation settings.

        Args:
            csv_file: Path (or file-like object) of the ``;``-separated CSV.
            data_config: Dict with the keys ``'reso'``, ``'trimapK'`` (two
                values, forwarded to ``generate_trimap``) and ``'noise'``
                (truthy to perturb the background).
            transform: Optional callable applied to the finished sample dict.
        """
        frames = pd.read_csv(csv_file, sep=';')
        self.frames = np.array(frames)
        self.transform = transform
        self.resolution = data_config['reso']
        self.trimapK = data_config['trimapK']
        self.noise = data_config['noise']

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.frames)

    def __getitem__(self, idx):
        """Load, augment and return the sample stored at row ``idx``.

        Returns:
            Dict with the keys ``'image'``, ``'fg'``, ``'alpha'``, ``'bg'``,
            ``'bg_tr'`` and ``'trimap'``, each produced by ``to_tensor``; the
            dict is passed through ``self.transform`` when one is set.

        Errors raised while reading or processing the images propagate to
        the caller.
        """
        # Set on every fetch, as in the original code.
        # NOTE(review): presumably done here so the settings also apply inside
        # DataLoader worker processes -- confirm.
        cv2.setNumThreads(0)
        cv2.ocl.setUseOpenCL(False)

        # Load the four images of this row.
        fg = io.imread(self.frames[idx, 0])
        alpha = io.imread(self.frames[idx, 1])
        image = io.imread(self.frames[idx, 2])
        back = io.imread(self.frames[idx, 3])

        # Bring everything to a common 800x800 working size.
        fg = cv2.resize(fg, dsize=(800, 800))
        alpha = cv2.resize(alpha, dsize=(800, 800))
        back = cv2.resize(back, dsize=(800, 800))
        image = cv2.resize(image, dsize=(800, 800))

        sz = self.resolution

        # Random horizontal flip, applied to all four images together.
        if np.random.random_sample() > 0.5:
            alpha = cv2.flip(alpha, 1)
            fg = cv2.flip(fg, 1)
            back = cv2.flip(back, 1)
            image = cv2.flip(image, 1)

        trimap = generate_trimap(alpha, self.trimapK[0], self.trimapK[1], False)

        # Random crop + scale: the same crop origin and size are used for
        # every image so they stay aligned.
        different_sizes = [(576, 576), (608, 608), (640, 640), (672, 672),
                           (704, 704), (736, 736), (768, 768), (800, 800)]
        crop_size = random.choice(different_sizes)
        x, y = random_choice(trimap, crop_size)

        fg = safe_crop(fg, x, y, crop_size, sz)
        alpha = safe_crop(alpha, x, y, crop_size, sz)
        image = safe_crop(image, x, y, crop_size, sz)
        back = safe_crop(back, x, y, crop_size, sz)
        trimap = safe_crop(trimap, x, y, crop_size, sz)

        fg = fg.astype(np.uint8)
        alpha = alpha.astype(np.uint8)
        image = image.astype(np.uint8)
        back = back.astype(np.uint8)

        # Perturb background: random noise addition or gamma change.
        # Start from the unperturbed background so 'bg_tr' is always defined;
        # previously it was only assigned when self.noise was truthy, which
        # made the sample construction below fail for noise=False.
        back_tr = back
        if self.noise:
            if np.random.random_sample() > 0.6:
                sigma = np.random.randint(low=2, high=6)
                mu = np.random.randint(low=0, high=14) - 7
                back_tr = add_noise(back, mu, sigma)
            else:
                back_tr = skimage.exposure.rescale_intensity(back, out_range=(0, 255))
                back_tr = skimage.exposure.adjust_gamma(back_tr, np.random.normal(1, 0.12))
        # Cast on every path; astype also returns a new array, so 'bg_tr'
        # never aliases the array behind 'bg'.
        back_tr = back_tr.astype(np.uint8)

        sample = {
            'image': to_tensor(image),
            'fg': to_tensor(fg),
            'alpha': to_tensor(alpha),
            'bg': to_tensor(back),
            'bg_tr': to_tensor(back_tr),
            'trimap': to_tensor(trimap),
        }

        if self.transform:
            sample = self.transform(sample)
        return sample
The helper functions used above are the same as in the Background-Matting project. When I run the dummy script, I see a fast increase in cache memory (about 1 GB per 50 mini-batches). I also tried several libraries (PIL, skimage, cv2 and imageio) to read the images, but the behavior is the same.
I suspect the file cache is causing this memory consumption, but what's weird is that the cache memory still increases after one epoch, when in theory all images are already cached. I am using a conda virtual environment on an Ubuntu system with 128 GB of RAM; the basic environment information is listed below:
python 3.6
pytorch 1.1.0
numpy==1.17.0
opencv-python==3.4.5.20
pandas
Pillow==6.1
scikit-image==0.14.2
scipy==1.2.1
tqdm
tensorboardX