Training loss does not decrease and test set is not working

Hi, I have a problem with a project I’m developing with PyTorch (autoencoders for anomaly detection).

I have a very large dataset of over 400000 images, each of size (256, 256, 4), and to handle it efficiently I decided to implement a custom Dataset.
The images live in a folder called DATASET, which contains two subfolders: one called “train”, which contains a folder called “clean” with all the images of the training set, and one called “test”, which contains one folder per label with the corresponding images.
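For context, a minimal sketch of such a custom Dataset could look like the following. This is only an illustration, assuming the images are 4-channel (RGBA) PNGs and using a hypothetical class name `CleanImageDataset`; the extension and loader would need to match the actual files:

```python
import os
from glob import glob

import numpy as np
import torch
from PIL import Image
from torch.utils.data import Dataset

class CleanImageDataset(Dataset):
    """Loads all images under root_dir and returns (tensor, label) pairs.

    Assumes 4-channel (RGBA) images; the label is a placeholder (0)
    for the unlabeled training set.
    """
    def __init__(self, root_dir):
        self.paths = sorted(glob(os.path.join(root_dir, "*.png")))

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        img = Image.open(self.paths[idx]).convert("RGBA")
        # (H, W, 4) uint8 -> (4, H, W) float32 in [0, 1]
        arr = np.asarray(img, dtype=np.float32) / 255.0
        tensor = torch.from_numpy(arr).permute(2, 0, 1)
        return tensor, 0

# train_dataset = CleanImageDataset("DATASET/train/clean")
```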

The problem is that I train the network for 5 epochs, and the training loss after the first epoch is around 0.185. Over the following 4 epochs it decreases very slowly, and at the end I get a loss of around 0.1845, so since the first epoch the loss has decreased by only 0.0005.

This is how I implement the Autoencoder:

class Encoder(nn.Module):
    def __init__(self, code_size):
        super().__init__()
        self.encoder_conv = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=32, kernel_size=3, stride=2, padding=1),
            # OUTPUT = (W - F + 2P)/S + 1 = (256 - 3 + 2)/2 + 1 = 128
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
        )
        self.flatten = nn.Flatten(start_dim=1)
        # linear layers
        self.encoder_lin = nn.Sequential(
            nn.Linear(in_features=256*16*16, out_features=64),   # we first define a fcl of size 64
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=code_size),   # we reduce its size with another fcl of size = code_size
        )

    def forward(self, x):
        x = self.encoder_conv(x)
        x = self.flatten(x)
        x = self.encoder_lin(x)
        return x

and this is the decoder:

class Decoder(nn.Module):
    def __init__(self, code_size):
        super().__init__()
        # linear part of the decoder
        self.decoder_lin = nn.Sequential(
            nn.Linear(in_features=code_size, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=16*16*256),
        )
        # unflattening
        self.unflatten = nn.Unflatten(dim=1, unflattened_size=(256, 16, 16))
        # convolutional part of the decoder, where we execute the transpose convolution operation to undo the convolution
        self.decoder_conv = nn.Sequential(
            nn.ConvTranspose2d(in_channels=256, out_channels=128, kernel_size=3, stride=2, padding=1, output_padding=0),
            nn.ReLU(),
            nn.ConvTranspose2d(in_channels=128, out_channels=64, kernel_size=3, stride=2, padding=0, output_padding=0),
            nn.ReLU(),
            nn.ConvTranspose2d(in_channels=64, out_channels=32, kernel_size=3, stride=2, padding=0, output_padding=0),
            nn.ReLU(),
            nn.ConvTranspose2d(in_channels=32, out_channels=4, kernel_size=3, stride=2, padding=0, output_padding=1),
            nn.ReLU(),
        )

    def forward(self, x):
        x = self.decoder_lin(x)
        x = self.unflatten(x)
        x = self.decoder_conv(x)
        x = torch.sigmoid(x)
        return x

This is how I check that the reconstructed image has the same size as the original one:

img, _ = train_dataset[0]
img = img.unsqueeze(0) # Add the batch dimension in the first axis
print('Before encoder:', img.shape)
img_enc = encoder(img)
print('After encoder:', img_enc.shape)
dec_img = decoder(img_enc)
print('After decoder:', dec_img.shape)

and the output is the following:
Before encoder: torch.Size([1, 4, 256, 256])

After encoder: torch.Size([1, 10])

After decoder: torch.Size([1, 4, 256, 256])
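As a cross-check of the shapes above, the spatial sizes can also be verified on paper with the standard conv / transposed-conv output formulas (pure arithmetic, using only the layer parameters shown in the code):

```python
def conv_out(w, k, s, p):
    # Standard Conv2d output size: floor((W - F + 2P) / S) + 1
    return (w - k + 2 * p) // s + 1

def deconv_out(w, k, s, p, op):
    # ConvTranspose2d output size: (W - 1) * S - 2P + F + OP
    return (w - 1) * s - 2 * p + k + op

# Encoder: four stride-2 convs with kernel_size=3, padding=1
w = 256
for _ in range(4):
    w = conv_out(w, k=3, s=2, p=1)
print(w)  # 16 -> matches the 256*16*16 flatten size

# Decoder: the four ConvTranspose2d layers above
w = 16
for k, s, p, op in [(3, 2, 1, 0), (3, 2, 0, 0), (3, 2, 0, 0), (3, 2, 0, 1)]:
    w = deconv_out(w, k, s, p, op)
print(w)  # 256 -> back to the input resolution
```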

Then I define the parameters for the training:

# definition of a single optimizer for both networks
learning_rate = 5e-4
params_to_optimize = [
    {'params': encoder.parameters()},
    {'params': decoder.parameters(), "lr": 0.1}
]
optim = torch.optim.Adam(params_to_optimize, lr=learning_rate, weight_decay=1e-5)
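For what it’s worth, the learning rate each group actually uses can be inspected directly on the optimizer. A small self-contained sketch, with dummy parameters standing in for the two networks:

```python
import torch

# Dummy parameters standing in for encoder / decoder parameters
enc_params = [torch.nn.Parameter(torch.zeros(3))]
dec_params = [torch.nn.Parameter(torch.zeros(3))]

optim = torch.optim.Adam(
    [{"params": enc_params},              # falls back to the default lr
     {"params": dec_params, "lr": 0.1}],  # overrides it for this group
    lr=5e-4, weight_decay=1e-5)

for i, group in enumerate(optim.param_groups):
    print(f"group {i}: lr={group['lr']}")
# group 0 uses the default 5e-4, group 1 uses the override 0.1
```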

# Check if the GPU is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device: ", device)

# Moving the networks to the selected device
encoder.to(device)
decoder.to(device)

loss_function = torch.nn.MSELoss()

and this is the training function for a single epoch:

def training_single_epoch(encoder, decoder, loss_fn, device, dataloader, optimizer):
    losses = []
    for idx, (image_batch, _) in enumerate(dataloader): # iterating over the tuples returned by the dataloader (no need for labels)
        print("\nIdx of training batch:", idx, "/", len(dataloader))
        image_batch = image_batch.to(device)
        encoded_batch = encoder(image_batch)
        decoded_batch = decoder(encoded_batch)
        loss_batch = loss_fn(decoded_batch, image_batch)   # loss between final output and original image
        # backprop
        print("\nBackprop at index", idx, "/", len(dataloader))
        optimizer.zero_grad()
        loss_batch.backward()
        optimizer.step()
        # here we are appending the loss of this single batch
        losses.append(loss_batch.item())
    # taking the average of the losses over a single epoch (so this is a scalar)
    avg_loss = np.mean(losses)
    return avg_loss
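Before committing to a multi-hour run, a loop like this can be smoke-tested end to end on random tensors. The sketch below uses a deliberately tiny stand-in autoencoder (not the real model) just to verify that forward, backward, and optimizer steps all run and produce finite losses:

```python
import torch
import torch.nn as nn

torch.manual_seed(0)

# Tiny stand-in autoencoder: enough to exercise forward / backward / step
enc = nn.Sequential(nn.Conv2d(4, 8, 3, stride=2, padding=1), nn.ReLU())
dec = nn.Sequential(
    nn.ConvTranspose2d(8, 4, 3, stride=2, padding=1, output_padding=1),
    nn.Sigmoid())
opt = torch.optim.Adam(list(enc.parameters()) + list(dec.parameters()), lr=1e-3)
loss_fn = nn.MSELoss()

batch = torch.rand(2, 4, 16, 16)  # fake "images" in [0, 1]
losses = []
for _ in range(3):
    opt.zero_grad()
    out = dec(enc(batch))
    loss = loss_fn(out, batch)
    loss.backward()
    opt.step()
    losses.append(loss.item())

print(losses)  # three finite loss values; shapes round-trip correctly
```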

What I do to train the model is simply iterating that function for num_epochs times:

num_epochs = 5

tr_losses = []

for ep in range(num_epochs):
    print("EPOCH:", ep+1, "/", num_epochs)
    # Training
    training_loss = training_single_epoch(encoder=encoder, decoder=decoder,
                                          loss_fn=loss_function, device=device,
                                          dataloader=train_dataloader, optimizer=optim)
    tr_losses.append(training_loss)
    print("Training epoch:", ep+1, "/", num_epochs, ". Training loss:", training_loss)
    # Save network parameters
    torch.save(encoder.state_dict(), 'encoder_params2.pth')
    torch.save(decoder.state_dict(), 'decoder_params2.pth')

The training takes about 1 hour and 45 minutes and the training loss, as already said, is about 0.185 after the first epoch and then decreases very slowly, and at the end of the 5th epoch it’s around 0.1845.
What is the problem? I really don’t understand what I’m doing wrong.

And also, this is how I retrieve the loss for the validation set (and also for the test set) — is it the right way to do it?

# Using the validation set to obtain the distribution of the losses over a set that has not been used for training
losses_validation = []
idx = 0
for sample in validation_dataset:
    print("Processing sample", idx, "/", len(validation_dataset))
    img = sample[0].unsqueeze(0).to(device)   # adding the batch dimension
    label = sample[1]
    with torch.no_grad():
        encoded = encoder(img)
        decoded = decoder(encoded)
    loss = loss_function(decoded, img)
    losses_validation.append(loss.item())
    idx = idx + 1
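As an aside, the same per-sample losses can be computed in batches with `MSELoss(reduction='none')`, keeping only Python floats on the CPU so memory stays bounded. A self-contained sketch — the `nn.Identity()` modules and the random `TensorDataset` are stand-ins for the real encoder, decoder, and validation set:

```python
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Stand-ins so the sketch runs on its own; swap in the real encoder,
# decoder, device, and validation_dataset.
device = torch.device("cpu")
encoder = nn.Identity()
decoder = nn.Identity()
validation_dataset = TensorDataset(torch.rand(10, 4, 8, 8), torch.zeros(10))

loss_per_pixel = nn.MSELoss(reduction='none')
losses_validation = []
loader = DataLoader(validation_dataset, batch_size=4, shuffle=False)
with torch.no_grad():
    for imgs, _ in loader:
        imgs = imgs.to(device)
        decoded = decoder(encoder(imgs))
        # mean over channels and pixels -> one scalar loss per sample
        per_sample = loss_per_pixel(decoded, imgs).mean(dim=(1, 2, 3))
        losses_validation.extend(per_sample.cpu().tolist())

print(len(losses_validation))  # one loss per validation sample
```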

Another problem is that when I try to do the same with the test set, the kernel crashes if it has more than 3500 samples. I think this is because the memory fills up.
What am I doing wrong?

Thank you and sorry for the long text

How did you come up with the different learning rates as they show a huge difference in magnitude (1e-1 vs. 5e-4)?

To be honest I need to try different parameters, and I just started with the configuration proposed by my professor. Besides the parameters, do you think the implementation is correct?
Does the way I build the encoder, the decoder and the optimizer make sense? And also the way I do the training and the way I feed the validation set to recover the loss?
Because if it is correct I can start focusing on the parameters.

Thank you so much for your time

And also, why do you think the kernel dies when I try to feed the test set to the network if its size is greater than 3000? If it is related to RAM usage, is there a way I can fix it?
Thanks again

Using an nn.ReLU as the last layer in self.decoder_conv followed by a sigmoid looks weird and I would probably remove the relu.

Yes, it could be related to an OOM, but you would need to check the actual error message or dmesg to see if the OS killed the process.
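To see why that trailing ReLU is harmful: it clamps every negative pre-activation to 0, so after the sigmoid the reconstruction can never go below 0.5 — half of the [0, 1] output range becomes unreachable. A quick demonstration:

```python
import torch

x = torch.linspace(-5, 5, steps=11)

with_relu = torch.sigmoid(torch.relu(x))  # ReLU kept before the sigmoid
without_relu = torch.sigmoid(x)           # sigmoid applied directly

print(with_relu.min().item())     # 0.5 -> dark pixels are impossible
print(without_relu.min().item())  # ~0.0067 -> full [0, 1] range reachable
```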

Thank you for the suggestion.
The original images are like this:

[image]

and this is the reconstruction I got, which is completely different:

[image]

Do you think the problem might be in the structure of the autoencoder? I don’t think the autoencoder I implemented is too simple, so I would like to keep this structure (also, to avoid overfitting, I don’t want to create a structure that is too complex).
But given this result, either my autoencoder is too simple after all, or there is something wrong somewhere and I don’t understand where.

I obtain this kind of reconstruction after the training, and for all the images inside the training set.
My autoencoder is not able to reconstruct the input images; I don’t know whether this is due to a wrong model structure, to the parameters I chose for the training, or to a mistake somewhere else.

What do you think?

Update: I hadn’t noticed the ReLU after the last layer of the decoder. I removed it, and now the performance is better in the sense that I obtain training losses that are much lower: after one epoch I get 0.009, then 0.0019, 0.001882, 0.00180 and 0.00168716.
The reconstructed images are better too, like this:

Is there anything you would do to improve the performance? Maybe I should use a larger number of epochs, because the autoencoder is still not able to reproduce the input well.
The problem is that the training takes around 2 hours and I don’t have a lot of time to complete this project.
I will try to use a bigger training set, and I will add a fully connected linear layer to make the autoencoder more powerful. I will also try to increase the code size (from 16 to 20, not by much, to avoid overfitting), and I will train the model during the night; hopefully it will work. Anything else I should try?