After monitoring CPU RAM usage, I find that it grows in every epoch: memory increases steadily while an epoch is running, is not freed when the epoch ends, and keeps growing into the next epoch. So memory usage never levels off after the first epoch, as it should, and after some number of epochs this leads to an out-of-memory (OOM) error on the CPU. Note that GPU memory does stay constant after the first epoch.
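For reference, the CPU RAM numbers come from watching the resident set size of the training process; a minimal sketch of such a check (using psutil, not part of the training script itself):

import os
import psutil

def log_cpu_ram(tag=""):
    # Resident set size (RSS) of the current process, in MB
    rss_mb = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2
    print(f"[{tag}] CPU RAM: {rss_mb:.1f} MB")

Printed at the end of each epoch, a counter like this is what keeps climbing.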
From reading related posts I believe the problem is in my custom Dataset implementation, although I can't pinpoint where. I gather all image file names into a list, save it as a CSV, and then load that CSV into a dataframe that MyDataset uses to fetch the data, since people on the forum have cautioned against keeping the file list in a plain Python list.
Some things I have tried that did not help:
- Setting num_workers = 0
- Using a numpy array of file names instead of a Python list (see the sketch right after this list)
- Decreasing the batch size to 1
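For the numpy attempt, what I mean is roughly the following (a sketch; the variable names are illustrative and it assumes ASCII file paths):

import numpy as np

# Fixed-width byte-string array instead of a Python list of str objects,
# the usual suggestion against copy-on-write growth in DataLoader workers.
filenames = np.array(train_files['target'], dtype=np.string_)

# Inside the Dataset, decode back to str before opening the file:
path = filenames[index].decode('utf-8')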
For 100k images and batch size 50 the error hits around the 25th epoch; for 50k images it hit around the 50th epoch. In both cases that is roughly 100,000 × 25 = 50,000 × 50 = 2.5 million samples processed, so the growth seems to track the total number of samples drawn rather than the number of epochs.
This is a 2x super resolution script with (16,16,4) input images and (32,32,4) output images.
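For reference, each sample produced by the dataset below should therefore have these shapes (a quick sanity check, assuming 4-channel RGBA images; not part of the training script):

sample = training_dataset[0]                     # MyDataset from progressive_loader.py
assert sample['input'].shape == (4, 16, 16)      # low-resolution input
assert sample['bicubic'].shape == (4, 32, 32)    # bicubic-upsampled baseline
assert sample['target'].shape == (4, 32, 32)     # high-resolution target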
Code:
- train.py -> Main training loop and gathering file names (added here)
- progressive_loader.py -> Custom dataset implementation (added here)
- prosrs.yaml -> hyperparameters & configuration file
- generators.py -> model (modified DenseNet-style architecture, ~50+ layers)
train.py:
def load_dataset(args):
    files = {'train': {}, 'test': {}}
    for phase in ['train', 'test']:
        for ft in ['source', 'target']:
            if args[phase].dataset.path[ft]:
                files[phase][ft] = get_filenames(
                    args[phase].dataset.path[ft], image_format=IMG_EXTENSIONS)
            else:
                files[phase][ft] = []
    return files['train'], files['test']
def main(args):
    ############### loading datasets #################
    train_files, test_files = load_dataset(args)
    print("Dataset images retrieved")

    num_images_to_train = 100000
    train_files['target'] = train_files['target'][:num_images_to_train]
    test_files['target'] = train_files['target'].copy()

    with open('imagepaths.csv', "w") as output:
        writer = csv.writer(output, lineterminator='\n')
        for val in train_files['target']:
            writer.writerow([val])

    # Dataset passing
    training_dataset = MyDataset(
        prosr.Phase.TRAIN,
        scale=args.data.scale,
        input_size=args.data.input_size,
        args=args,
        **args.train.dataset)
    training_data_loader = DataLoader(
        training_dataset, batch_size=args.train.batch_size, shuffle=False, num_workers=4)

    if len(test_files['target']):
        testing_dataset = MyDataset(
            prosr.Phase.VAL,
            scale=args.data.scale,
            input_size=None,
            args=args,
            **args.test.dataset)
        testing_data_loader = DataLoader(
            testing_dataset, batch_size=1, shuffle=False, num_workers=4)
    else:
        testing_dataset = None
        testing_data_loader = None

    start_epoch = 0
    lr = args.train.lr
    # save_dir = args.cmd.output
    steps_per_epoch = len(training_data_loader)
    total_steps = start_epoch * steps_per_epoch

    ############# start training ##############
    batchsize = args.train.batch_size
    print("Batch size = ", batchsize)
    print("Num batches size = ", len(training_data_loader))

    loss = []
    psnr_list = []
    # output_imgs = torch.zeros((len(trainer.training_dataset)*batchsize, 4, 32, 32))
    num_random = 100
    HR_imgs = torch.zeros((num_random, 4, 32, 32))
    output_imgs = torch.zeros((num_random, 4, 32, 32))
    random_indices = randint(0, (len(training_data_loader) * batchsize) - 1, num_random)

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    args.G.max_scale = max(args.data.scale)
    net_G = ProSR(**args.G).cuda()
    optimizer_G = torch.optim.Adam(
        [p for p in net_G.parameters() if p.requires_grad],
        lr=args.train.lr,
        betas=(0.9, 0.999),
        eps=1.0e-08)
    l1_criterion = torch.nn.L1Loss()
    #########################################################################
    for epoch in range(start_epoch + 1, args.train.epochs + 1):
        iter_start_time = time()
        epoch_start_time = time()
        net_G.train()
        epoch_loss = 0
        print("Epoch: ", epoch)
        for i, data in enumerate(training_data_loader):
            # Forward and backward pass
            lr = data['input'].cuda()
            hr = data['target'].cuda()
            interpolated = data['bicubic'].cuda()
            output_batch = net_G(lr, upscale_factor=2) + interpolated

            optimizer_G.zero_grad()
            l1_loss = l1_criterion(output_batch, hr)
            l1_loss.backward()
            optimizer_G.step()

            epoch_loss += l1_loss
            total_steps += 1
#################################################################
progressive_loader.py:
def pil_loader(path, args, mode='RGBA'):
    with open(path, 'rb') as f:
        with Image.open(f) as img:
            return img.convert(mode)
def downscale_by_ratio(img, ratio=2, method=Image.BICUBIC):
    if ratio == 1:
        return img
    w, h = img.size
    w, h = floor(w / ratio), floor(h / ratio)
    return img.resize((w, h), method)

class MyDataset(Dataset):
    def __init__(self, phase, scale, input_size, args, mean,
                 stddev, downscale, **kwargs):
        self.phase = phase
        self.scale = 2
        self.mean = mean
        self.stddev = stddev
        self.args = args
        self.image_loader = pil_loader
        self.downscale = downscale
        self.data_frame = pd.read_csv("imagepaths.csv")

        # Input normalization
        self.normalize_fn = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(self.mean, self.stddev)
        ])

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, index):
        return self.get(index)

    def get(self, index, scale=2):
        if torch.is_tensor(index):
            index = index.tolist()
        scale = 2
        ret_data = {}
        ret_data['scale'] = scale

        # Load target image
        if len(self.data_frame):
            target_img = self.image_loader(self.data_frame.iloc[index, 0], self.args)
            ret_data['target'] = target_img
            ret_data['target_fn'] = self.data_frame.iloc[index, 0]

        ret_data['input'] = downscale_by_ratio(
            ret_data['target'], scale, method=Image.BICUBIC)
        ret_data['input_fn'] = self.data_frame.iloc[index, 0]

        # Change Image.BICUBIC to Image.BILINEAR
        ret_data['bicubic'] = downscale_by_ratio(
            ret_data['input'], 1 / scale, method=Image.BICUBIC)

        ret_data['input'] = self.normalize_fn(ret_data['input'])
        ret_data['bicubic'] = self.normalize_fn(ret_data['bicubic'])
        if len(self.data_frame):
            ret_data['target'] = self.normalize_fn(ret_data['target'])

        return ret_data
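If it helps narrow things down, one isolation test would be to iterate MyDataset directly, with no DataLoader and no training step, and watch the process RSS (a minimal sketch, assuming psutil is available):

import os
import psutil

proc = psutil.Process(os.getpid())
for idx in range(len(training_dataset)):
    _ = training_dataset[idx]          # exercises only the Dataset code path
    if idx % 5000 == 0:
        print(f"sample {idx}: RSS = {proc.memory_info().rss / 1024 ** 2:.1f} MB")

If RSS stays flat here, the growth would point at the training loop / DataLoader side rather than at MyDataset itself.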
Can someone suggest where the problem is?