I am trying to train a CNN on images. Instead of splitting the images into separate train and validation folders, I defined my own Dataset subclasses as follows:
import os

import torch
import torch.optim as optim
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models import mobilenet_v3_large
from tqdm import tqdm

# device, compute_score, NIMA, labels_df, images_dir and the filename lists
# are defined elsewhere in my notebook

class TrainDataset(Dataset):
    def __init__(self, root, transform=None, labels_df=None, image_filenames=None,
                 augmentation_transform=None, score_th=7.0):
        assert labels_df is not None, "please provide the ground truth"
        assert image_filenames, "you must provide the list of filenames"
        self.root = root
        self.transform = transform
        self.labels_df = labels_df
        self.image_filenames = image_filenames
        self.augmentation_transform = augmentation_transform
        self.score_th = score_th

    def __len__(self):
        return len(self.image_filenames)

    def __getitem__(self, index):
        # load the image
        image_path = os.path.join(self.root, self.image_filenames[index])
        image_num = int(self.image_filenames[index].split(".")[0])
        image = Image.open(image_path).convert('RGB')
        # apply the transform shared by all images
        if self.transform:
            image = self.transform(image)
        # load the score distribution for the image from the DataFrame
        row = self.labels_df[self.labels_df['image_num'] == image_num]
        labels = torch.tensor(row.iloc[:, 1:].values, dtype=torch.float32)
        labels = labels.permute(1, 0).to(device)
        score = compute_score(labels)
        # extra augmentation only for high-scoring images
        if score >= self.score_th and self.augmentation_transform:
            image = self.augmentation_transform(image)
        return image.to(device), labels
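compute_score is a small helper defined elsewhere in my notebook; it reduces the (10, 1) label distribution to its mean score. Roughly like this (a hypothetical sketch, my real helper may differ slightly):

def compute_score(labels):
    # mean score of a 10-bin distribution: sum over i of i * p(i), for i = 1..10
    score_values = torch.arange(1, 11, dtype=torch.float32, device=labels.device).view(10, 1)
    return (labels * score_values).sum().item()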
class ValDataset(Dataset):
    def __init__(self, root, transform=None, labels_df=None, image_filenames=None):
        assert labels_df is not None, "please provide the ground truth"
        assert image_filenames, "you must provide the list of filenames"
        self.root = root
        self.transform = transform
        self.labels_df = labels_df
        self.image_filenames = image_filenames

    def __len__(self):
        return len(self.image_filenames)

    def __getitem__(self, index):
        # load the image
        image_path = os.path.join(self.root, self.image_filenames[index])
        image_num = int(self.image_filenames[index].split(".")[0])
        image = Image.open(image_path).convert('RGB')
        # apply the transform
        if self.transform:
            image = self.transform(image)
        # load the score distribution for the image from the DataFrame
        row = self.labels_df[self.labels_df['image_num'] == image_num]
        labels = torch.tensor(row.iloc[:, 1:].values, dtype=torch.float32)
        labels = labels.permute(1, 0).to(device)
        return image.to(device), labels
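For completeness, the train/validation filename lists are just a random split of the image folder, built along these lines (a sketch; the exact split code lives elsewhere in my notebook, and the 80/20 ratio is only an example):

import random

all_filenames = sorted(os.listdir(images_dir))  # files named like "12345.jpg"
random.seed(42)                                 # reproducible split
random.shuffle(all_filenames)
val_size = int(0.2 * len(all_filenames))
val_filenames = all_filenames[:val_size]
train_filenames = all_filenames[val_size:]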
The DataLoader objects are then created like this:
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
val_data = ValDataset(images_dir, transform=val_transform, labels_df=labels_df,
                      image_filenames=val_filenames)
val_dl = DataLoader(val_data, batch_size=64, shuffle=False)

# Define the transforms to be applied to every training image
train_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomCrop((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
augmentation_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(0.7)
])

# create the custom dataset using the images folder and the labels DataFrame
train_data = TrainDataset(images_dir, transform=train_transform,
                          labels_df=labels_df, image_filenames=train_filenames,
                          augmentation_transform=augmentation_transform)
train_dl = DataLoader(train_data, batch_size=64, shuffle=True)
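A quick sanity check on one batch gives the shapes I expect; the (64, 10, 1) label shape is what __getitem__ produces and matches the view(-1, 10, 1) in the training loop below:

images, labels = next(iter(train_dl))
print(images.shape)  # torch.Size([64, 3, 224, 224])
print(labels.shape)  # torch.Size([64, 10, 1]) -- one 10-bin distribution per image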
The training loop is the following:
def train(model, optimizer, epochs: int, train_dl, valid_dl=None, beta=3.33, gamma=2.5,
          save_path=None, patience=15, scheduler=None, checkpoint=None, checkpoint_sp=None):
    assert save_path is not None and os.path.exists(os.path.dirname(save_path)), \
        "In order to save weights you must provide a valid path"
    if checkpoint is None:
        best_valid_loss = 0
        loss_set = False
        best_params = None
        best_epoch = 0
        no_improv_epochs = 0
        starting_epoch = 0
    else:
        best_valid_loss = checkpoint['bvl']
        starting_epoch = checkpoint['epoch'] + 1  # 'epoch' holds the last completed training epoch
        best_params = checkpoint['best_params']
        loss_set = True
        no_improv_epochs = checkpoint['no_imp_epoch']
        best_epoch = checkpoint['best_epoch']
    for epoch in range(starting_epoch, epochs):
        model.train()
        train_loss = 0
        train_samples = 0
        loop = tqdm(train_dl)
        loop.set_description(f'Epoch: {epoch+1} / {epochs}')
        for data in loop:
            inputs, targets = data
            optimizer.zero_grad()
            # Forward pass
            outputs = model(inputs)
            outputs = outputs.unsqueeze(dim=-1)
            outputs = outputs.view(-1, 10, 1)  # reshape to (batch, 10, 1) to match the labels
            loss = batch_focal_r_loss(outputs, targets, beta=beta, gamma=gamma)
            train_samples += inputs.shape[0]
            train_loss += loss * inputs.shape[0]
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
        # validation
        train_loss /= train_samples
        model.eval()
        with torch.no_grad():
            valid_loss = 0
            valid_samples = 0
            if valid_dl is not None:
                for valid_data in valid_dl:
                    val_inputs, val_targets = valid_data
                    valid_scores = model(val_inputs)
                    valid_scores = valid_scores.unsqueeze(dim=-1)
                    valid_scores = valid_scores.view(-1, 10, 1)
                    vl = batch_focal_r_loss(valid_scores, val_targets, beta=beta, gamma=gamma)
                    valid_loss += vl * val_inputs.shape[0]
                    valid_samples += val_inputs.shape[0]
                valid_loss /= valid_samples
                if scheduler:
                    scheduler.step(valid_loss)
        # Print the information of the training epoch
        epoch_info = f"train loss: {train_loss.item():.3f}"
        if valid_dl:
            epoch_info += f" validation loss: {valid_loss.item():.3f}"
        print(epoch_info)
        if valid_dl is None:
            best_params = model.state_dict()
            best_epoch = epoch + 1
        elif valid_loss < best_valid_loss or not loss_set:
            best_valid_loss = valid_loss
            loss_set = True
            best_params = model.state_dict()
            best_epoch = epoch + 1
            no_improv_epochs = 0
        else:
            no_improv_epochs += 1
            # Early stopping
            if no_improv_epochs == patience:
                print(f"There have been no improvements in the last {patience} epochs, so the training will stop.")
                break
        torch.save({
            'epoch': epoch,
            'best_params': best_params,
            'best_epoch': best_epoch,
            'bvl': best_valid_loss,
            'no_imp_epoch': no_improv_epochs,
            'model': model.state_dict(),
            'optim': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()
        }, checkpoint_sp)
    torch.save(best_params, save_path)
    print("Weights correctly saved in {}".format(save_path))
    if valid_dl:
        print(f"The best epoch is epoch {best_epoch} with a validation loss of: {best_valid_loss.item()}")
    return best_params
And the code for the loss is the following:
def emd_loss(p, t, r=2):
    assert len(p) == len(t), "The distributions must have the same length"
    loss = 0.0
    cdf_p = 0.0
    cdf_t = 0.0
    for elem_p, elem_t in zip(p, t):
        cdf_p += elem_p
        cdf_t += elem_t
        loss += torch.abs(cdf_p - cdf_t) ** r
    loss = loss ** (1. / r)
    return loss

def focal_r_loss(p, t, beta, gamma):
    e = emd_loss(p, t)
    res = beta * e
    res = torch.sigmoid(res)
    res = res ** gamma
    res = res * e
    return res

def batch_focal_r_loss(p, t, beta=3.33, gamma=2, reduction='mean'):
    assert p.shape == t.shape, "The shapes must be the same, got {} and {}".format(p.shape, t.shape)
    assert reduction in ['mean', 'none'], "The reduction modality must be one of ['mean', 'none'], got {}".format(reduction)
    batch_dim = p.shape[0]
    losses = []
    for i in range(batch_dim):
        losses.append(focal_r_loss(p[i], t[i], beta, gamma))
    if reduction == 'mean':
        return sum(losses) / batch_dim
    # if reduction is 'none', return the list of per-sample losses as it is
    return losses
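For reference, the per-sample loss is the EMD weighted by a focal term, sigmoid(beta * EMD) ** gamma. The looped emd_loss above is equivalent to this vectorized version based on torch.cumsum, which I use to double-check the loop (a sketch, assuming the usual (10, 1) input shape):

def emd_loss_vectorized(p, t, r=2):
    # cumulative distributions along the score axis, then the r-norm of their difference
    cdf_diff = torch.cumsum(p, dim=0) - torch.cumsum(t, dim=0)
    return (torch.abs(cdf_diff) ** r).sum() ** (1. / r)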
When training the model on the whole dataset (more than 200k images) I notice two weird behaviors:
- The RAM usage keeps increasing until all of it is full and the training stops
- The GPU utilization stays below 50%

Can you help me find a way to fix the first problem (the RAM one)?
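The growth is visible with a simple psutil check like this one (a sketch; I print it from inside the epoch loop):

import psutil

process = psutil.Process(os.getpid())
print(f"RSS memory: {process.memory_info().rss / 1e9:.2f} GB")  # keeps growing every epoch

I'll also add the code that calls the training method: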
# Train the model
network = mobilenet_v3_large(weights="IMAGENET1K_V2")
model = NIMA(network).to(device)
optimizer = optim.Adam(model.parameters(), lr=5e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)
save_path = os.path.join(os.getcwd(), "Nima_wholeDataset.pt")
checkpoint_sp = os.path.join(os.getcwd(), "checkpoint.pt")
num_epochs = 50

# resume from the checkpoint if one exists
if os.path.isfile(checkpoint_sp):
    checkpoint = torch.load(checkpoint_sp)
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optim'])
    scheduler.load_state_dict(checkpoint['scheduler'])
else:
    checkpoint = None

best_weights = train(model, optimizer, num_epochs, train_dl, val_dl, save_path=save_path,
                     scheduler=scheduler, checkpoint=checkpoint, checkpoint_sp=checkpoint_sp)
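Once training finishes I load the best weights back for evaluation (a sketch using the save_path defined above, since train saves best_params as a plain state_dict):

model.load_state_dict(torch.load(save_path))
model.eval()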