Hello.
I’m having a problem with constant training loss.
Specifically, I am doing segmentation of MRI images using a U-Net.
The data consists of about 100,000 grayscale slices of size 32x32.
The data is loaded in a random order every epoch and training is repeated (e.g. there are 10 numpy files in total, so one epoch consists of 10 training passes, one per file, followed by 1 validation pass).
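In other words, one epoch schematically looks like this (a rough sketch only; the helper names here are placeholders, the real code is further below):

for i in random.sample(range(1, 11), 10):   # the 10 numpy files, visited in random order
    dataset = eDataset(i, 'data_path/')     # slices of one file
    train_on_file(dataset)                  # one training pass per file
validate(validloader)                       # one validation pass per epoch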
The essence of the problem is that after approximately 3 epochs, the training loss always ends up at the same value.
Things I have tried:
“data pre-processing”
Changed the normalization from
image = image*255/image.max()
to
image = image/(image.max()+0.00001)
“Remove BatchNorm in Network”
In U-Net’s double conv part, I tried both using nn.BatchNorm2d after each Conv2d and removing it.
“Learning Rate & Optimizer”
Tried both SGD and Adam
Tried learning rates in the range of 0.00001 to 0.5 (a rough sketch of the variants is shown right after this list)
“etc…”
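For reference, the optimizer variants I tried looked roughly like this (the exact learning-rate values varied between runs):

# SGD variant (learning rates between 0.00001 and 0.5 were tried)
optimizer = optim.SGD(model.parameters(), lr=0.00001)
# Adam variant, same learning-rate range
# optimizer = optim.Adam(model.parameters(), lr=0.00001)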
I have written out the code for my custom dataset, U-Net network, train/valid loop, etc. below.
Custom dataset
import numpy as np
import torch
from PIL import Image
import torchvision.transforms.functional as TF

class eDataset(torch.utils.data.Dataset):
    def __init__(self, i, data_path, augmentation=True):
        self.data_path = data_path
        self.data = np.load(data_path + 'Patch_images_{}.npy'.format(i)).astype(np.uint16)
        self.target = np.load(data_path + 'Patch_Tumor_{}.npy'.format(i)).astype(np.uint8)
        self.augmentation = augmentation

    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        x, y = self.transform(x, y)
        return x, y

    def transform(self, data, target):
        data, target = train_data(data, target, self.augmentation)
        return data, target

    def __len__(self):
        return len(self.data)

def train_data(image, mask, aug=True):
    image = Image.fromarray(image)
    mask = Image.fromarray(mask)
    image = TF.to_tensor(image).float()
    image = image / (image.max() + 0.00001)   # normalize to roughly [0, 1]
    mask = binarize(TF.to_tensor(mask)).float()
    return image, mask
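(binarize is just a small helper that turns the mask into a 0/1 tensor; a minimal version would be something like the sketch below, not necessarily my exact implementation:)

def binarize(mask):
    # map any non-zero mask value to 1, keep background at 0
    return (mask > 0)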
U-Net Network & Hyper Parameters
import torch
import torch.nn as nn

def double_conv(in_channels, out_channels):
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, 3, padding=1),
        #nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
        nn.Conv2d(out_channels, out_channels, 3, padding=1),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True)
    )

class UNet(nn.Module):
    def __init__(self, n_class):
        super().__init__()
        self.dconv_down1 = double_conv(1, 32)
        self.dconv_down2 = double_conv(32, 64)
        self.dconv_down3 = double_conv(64, 128)
        self.dconv_down4 = double_conv(128, 256)
        self.maxpool = nn.MaxPool2d(2)
        self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        self.dconv_up3 = double_conv(128 + 256, 128)
        self.dconv_up2 = double_conv(64 + 128, 64)
        self.dconv_up1 = double_conv(32 + 64, 32)
        self.conv_last = nn.Conv2d(32, n_class, 1)

    def forward(self, x):
        # encoder
        conv1 = self.dconv_down1(x)
        x = self.maxpool(conv1)
        conv2 = self.dconv_down2(x)
        x = self.maxpool(conv2)
        conv3 = self.dconv_down3(x)
        x = self.maxpool(conv3)
        x = self.dconv_down4(x)
        # decoder with skip connections
        x = self.upsample(x)
        x = torch.cat([x, conv3], dim=1)
        x = self.dconv_up3(x)
        x = self.upsample(x)
        x = torch.cat([x, conv2], dim=1)
        x = self.dconv_up2(x)
        x = self.upsample(x)
        x = torch.cat([x, conv1], dim=1)
        x = self.dconv_up1(x)
        out = self.conv_last(x)
        return out
import torch.optim as optim
from torch.optim import lr_scheduler

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = UNet(n_class=2)
model = model.to(device)

class_weights = torch.tensor([1.0, 1.0]).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.00001)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
Train/Valid loop
import copy
import time
import random

init_state = copy.deepcopy(model.state_dict())
init_state_opt = copy.deepcopy(optimizer.state_dict())
init_state_lr = copy.deepcopy(exp_lr_scheduler.state_dict())

since = time.time()
train_losses = []
val_losses = []
early_stopping = EarlyStopping(patience=5, verbose=1)

for epoch in range(num_epochs):
    print()
    print('Epoch {}/{}'.format(epoch, num_epochs - 1))
    print('-' * 10)
    epoch_loss = train_fit(epoch, model, phase='train')
    val_epoch_loss = valid_fit(epoch, model, validloader, phase='valid')
    train_losses.append(epoch_loss)
    val_losses.append(val_epoch_loss)
    if early_stopping.validate(val_epoch_loss):
        break
def train_fit(epoch, model, phase='train', volatile=False):
    torch.set_num_threads(4)
    epoch_loss = 0.0
    model.train().to(device)
    patient_index = list(range(1, 11))
    for i in range(10):
        # pick one of the remaining patient files at random
        secure_random = random.SystemRandom()
        random_patient = secure_random.choice(patient_index)
        train_datasets = eDataset(random_patient, "data_path/", augmentation=True)
        patient_index.remove(random_patient)
        data_loader = torch.utils.data.DataLoader(train_datasets, batch_size=batch_size,
                                                  shuffle=True, num_workers=0, pin_memory=False)
        running_loss = 0.0
        for batch_idx, (data, target) in enumerate(data_loader):
            inputs, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            with torch.set_grad_enabled(phase == 'train'):
                output = model(inputs).to(device)
                loss = criterion(output, target.long()).to(device)
                if phase == 'train':
                    loss.backward()
                    optimizer.step()
            running_loss += loss.item() * inputs.size(0)
        if phase == 'train':
            exp_lr_scheduler.step()
        loss = running_loss / len(data_loader.dataset)
        epoch_loss += loss
    epoch_loss = epoch_loss / 10
    print('{} Loss: {:.4f}'.format(phase, epoch_loss))
    return epoch_loss
def valid_fit(epoch, model, data_loader, phase='train', volatile=False):
    torch.set_num_threads(4)
    if phase == 'train':
        model.train().to(device)
    if phase == 'valid':
        model.eval().to(device)
    running_loss = 0.0
    for batch_idx, (data, target) in enumerate(data_loader):
        inputs, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        with torch.set_grad_enabled(phase == 'train'):
            output = model(inputs).to(device)
            loss = criterion(output, target.long()).to(device)
            if phase == 'train':
                loss.backward()
                optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    if phase == 'train':
        exp_lr_scheduler.step()
    loss = running_loss / len(data_loader.dataset)
    print('{} Loss: {:.4f}'.format(phase, loss))
    return loss
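(EarlyStopping is just a small helper class; roughly, it behaves like the minimal sketch below, where validate() returns True once the validation loss has stopped improving for `patience` epochs. The exact implementation is not the issue here:)

class EarlyStopping:
    def __init__(self, patience=5, verbose=0):
        self.patience = patience
        self.verbose = verbose
        self.best_loss = float('inf')
        self.counter = 0

    def validate(self, val_loss):
        # reset the counter whenever the validation loss improves
        if val_loss < self.best_loss:
            self.best_loss = val_loss
            self.counter = 0
            return False
        self.counter += 1
        if self.verbose:
            print('EarlyStopping counter: {}/{}'.format(self.counter, self.patience))
        return self.counter >= self.patience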
The post is rather long, but if there are any parts I am missing or mistakes I am making, I would appreciate any help.
Thanks!