before I had only train and test dataset and I was getting a good prediction result for semantic segmentation with dice coeff of about 0.89. I decided to add the validation dataset and decay the lr but it didn’t learn anything and loss and accuracy are static number. I am wondering where I am doing wrong.
this is snippet for custom_dataset
import torch
from torch.utils.data.dataset import Dataset # For custom data-sets
import torchvision.transforms as transforms
from PIL import Image
import numpy
class CustomDataset(Dataset):
def __init__(self, image_paths, target_paths): # initial logic happens like transform
self.image_paths = image_paths
self.target_paths = target_paths
self.transforms = transforms.ToTensor()
self.mapping = {
0: 0,
255: 1
}
def mask_to_class(self, mask):
for k in self.mapping:
mask[mask==k] = self.mapping[k]
return mask
def __getitem__(self, index):
image = Image.open(self.image_paths[index])
mask = Image.open(self.target_paths[index])
t_image = image.convert('L')
t_image = self.transforms(t_image)
#mask = torch.from_numpy(np.array(mask)) #this is for BMCC dataset
mask = torch.from_numpy(numpy.array(mask, dtype=numpy.uint8)) # this is for my dataset(lv)
mask = self.mask_to_class(mask)
mask = mask.long()
return t_image, mask, self.image_paths[index], self.target_paths[index]
def __len__(self): # return count of sample we have
return len(self.image_paths)
this is snippet for split the data to train, validation and test
from custom_dataset import CustomDataset
import torch
import glob
folder_data = glob.glob("D:\\Neda\\Pytorch\\U-net\\my_data\\imagesResized\\*.png")
folder_mask = glob.glob("D:\\Neda\\Pytorch\\U-net\\my_data\\labelsResized\\*.png")
# split these path using a certain percentage
len_data = len(folder_data)
print("count of dataset: ", len_data)
# count of dataset: 992
split_1 = int(0.8 * len(folder_data))
split_2 = int(0.9 * len(folder_data))
folder_data.sort()
train_image_paths = folder_data[:split_1]
print("count of train images is: ", len(train_image_paths))
#count of train images is: 793
valid_image_paths = folder_data[split_1:split_2]
print("count of validation image is: ", len(valid_image_paths))
#count of validation image is: 99
test_image_paths = folder_data[split_2:]
print("count of test images is: ", len(test_image_paths))
#count of test images is: 100
#print(test_image_paths)
train_mask_paths = folder_mask[:split_1]
valid_mask_paths = folder_mask[split_1:split_2]
test_mask_paths = folder_mask[split_2:]
train_dataset = CustomDataset(train_image_paths, train_mask_paths)
print(len(train_dataset[0]))
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=2)
valid_dataset = CustomDataset(valid_image_paths, valid_mask_paths)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=1, shuffle=True, num_workers=2)
test_dataset = CustomDataset(test_image_paths, test_mask_paths)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=2)
dataLoaders = {
'train': train_loader,
'valid': valid_loader,
}
this is snippet for training which I assume I am doing something wrong here. In previous experiment the learning rate was 0.00001 with the same loss function and optimizer same as current snippet but that experiment didn’t have decay lr. The question is the issue is it because of the hyper-parameter isn’t suitable for this task or I am doing something wrong in this training snippet which I couldn’t find out so far.
from split_dataset import dataLoaders
from U_Net_demo import model, device
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib.pyplot as plt
import time
import visdom
print("starting training")
#python -m visdom.server
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01) # 1e-3
# Decay LR by a factor of 0.1 every 20 epochs.
#step size: Period of learning rate decay.
#gamma = Multiplicative factor of learning rate decay. Default: 0.1, should float
scheduler = lr_scheduler.StepLR(optimizer, step_size=40, gamma=0.1)
def train():
since = time.time()
vis = visdom.Visdom()
loss_window = vis.line(X=torch.zeros((1,)).cpu(),
Y=torch.zeros((1)).cpu(),
opts=dict(xlabel='epoch',
ylabel='Loss',
title='Training Loss',
legend=['Loss']))
vis = visdom.Visdom()
accuracy_window = vis.line(X=torch.zeros((1,)).cpu(),
Y=torch.zeros((1)).cpu(),
opts=dict(xlabel='epoch',
ylabel='accuracy',
title='Training accuracy',
legend=['accuracy']))
num_epochs=150
for epoch in range(num_epochs): # loop over the dataset multiple times
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
scheduler.step()
model.train() # Set model to training mode
train_loss = 0.0
total_train = 0
correct_train = 0
#iterate over data
for t_image, mask, image_paths, target_paths in dataLoaders['train']:
# get the inputs
t_image = t_image.to(device)
mask = mask.to(device)
# zeroes the gradient buffers of all parameters
optimizer.zero_grad()
# forward
outputs = model(t_image)
_, predicted = torch.max(outputs.data, 1)
loss = criterion(outputs, mask) # calculate the loss
# backward + optimize only if in training phase
loss.backward() # back propagation
optimizer.step() # update gradients
# accuracy
train_loss += loss.item()
total_train += mask.nelement() # number of pixel in the batch
correct_train += predicted.eq(mask.data).sum().item() # sum all precited pixel values
train_epoch_loss = train_loss / len(dataLoaders['train'].dataset)
train_epoch_acc = 100 * (correct_train / total_train)
print ('|train loss: {:.4f}| train ACC: {:.4f}|'.format(train_epoch_loss, train_epoch_acc))
print('-' * 70)
vis.line(
X=torch.ones((1, 1)).cpu()*epoch,
Y=torch.Tensor([train_epoch_loss]).unsqueeze(0).cpu(),
win=loss_window,
update='append')
vis.line(
X=torch.ones((1, 1)).cpu()*epoch,
Y=torch.Tensor([train_epoch_acc]).unsqueeze(0).cpu(),
win=accuracy_window,
update='append')
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
torch.save(model.state_dict(), 'train_valid.pth')
this is train loss and train ACC which is static. train loss is started with a big number and
Epoch 0/149
|train loss: 571.1679| train ACC: 87.6056|
----------------------------------------------------------------------
Epoch 1/149
|train loss: 0.3584| train ACC: 88.5138|
----------------------------------------------------------------------
Epoch 2/149
|train loss: 0.3585| train ACC: 88.5138|
----------------------------------------------------------------------
Epoch 3/149
|train loss: 0.3582| train ACC: 88.5138|
----------------------------------------------------------------------
Epoch 4/149
|train loss: 0.3582| train ACC: 88.5138|
----------------------------------------------------------------------
Epoch 5/149
|train loss: 0.3579| train ACC: 88.5138|
----------------------------------------------------------------------
Epoch 6/149
|train loss: 0.3578| train ACC: 88.5138|
----------------------------------------------------------------------
Epoch 7/149
|train loss: 0.3574| train ACC: 88.5138|
----------------------------------------------------------------------
then after epoch 40 is fixed number till end.
Epoch 40/149
|train loss: 0.3566| train ACC: 88.5138|
----------------------------------------------------------------------