For my project, I am attempting to write an autoencoder where the input and output grayscale images are slightly different. At first, I thought my conv net was not working, so I tried to have the autoencoder recreate the original input, but no matter what I try, the autoencoder only returns a gray image.
I’m currently using my custom dataset, which I wrote with @ptrblck's help, but when testing whether my conv net has issues, I pass x (the input) as both x (the input) and y (the target output).
Here is my data loading. My grayscale images originally range from 0 to 255, but I have read that transforms.ToTensor() automatically scales them to between 0 and 1, so I don’t need to change the input.
# Preprocessing pipeline consisting of a single ToTensor step.
# NOTE(review): the DataLoader setup in this file builds MyDataset without
# passing this pipeline, so it currently has no effect on the loaded data.
transform = transforms.Compose([transforms.ToTensor()])
# load the training and test datasets
class MyDataset():
    """Paired-image dataset driven by a CSV file of image paths.

    Every CSV row describes one sample: column 0 holds the path of the
    input image and column 1 the path of the target image. The first line
    of the CSV is read as a header row.
    """

    def __init__(self, csv_file, transform=None):
        """Read the path table and remember the optional transform.

        Args:
            csv_file: Path to the CSV file listing (input, target) image paths.
            transform: Optional callable applied to both PIL images of a
                sample. When ``None`` the raw pixel arrays are returned.
        """
        self.image_paths = pd.read_csv(csv_file, header=0)
        self.transform = transform

    def __getitem__(self, index):
        """Load the sample at ``index`` and return it as ``(x, y)``.

        Without a transform, both images are returned as tensors built
        directly from their pixel arrays (values and dtype are untouched,
        so 8-bit images stay in the 0-255 range). With a transform, the
        transform's output for each image is returned instead.
        """
        row = self.image_paths.iloc[index]
        # Context managers close the underlying files once the pixel data
        # has been read; previously the handles were left open.
        with Image.open(row.iloc[0]) as image, \
                Image.open(row.iloc[1]) as image_transformed:
            if self.transform is not None:
                # The transform is applied to each image separately. That is
                # fine for deterministic transforms; a random transform would
                # draw different parameters for input and target.
                x = self.transform(image)
                y = self.transform(image_transformed)
            else:
                x = torch.from_numpy(np.array(image))
                y = torch.from_numpy(np.array(image_transformed))
        return x, y

    def __len__(self):
        """Return the number of samples (rows in the CSV)."""
        return len(self.image_paths)
# In[ ]:
# DataLoader settings shared by both splits.
num_workers = 0   # 0 worker subprocesses: samples are loaded in the main process
batch_size = 10   # number of samples grouped into one batch

# One loader per split; both read their (input, target) path pairs from a CSV.
train_loader = torch.utils.data.DataLoader(
    MyDataset("./train.csv"),
    batch_size=batch_size,
    num_workers=num_workers,
)
test_loader = torch.utils.data.DataLoader(
    MyDataset("./test.csv"),
    batch_size=batch_size,
    num_workers=num_workers,
)
From reading the shape, it seems the grayscale image is returned as a 2D array (width, height); I then reshape the 2D image into a 4D input (1, 1, width, height) that I can feed into the neural network.
Here is my convnet
class ConvAutoencoder(nn.Module):
    """Fully convolutional autoencoder for single-channel images.

    The encoder is six stride-1 convolutions and the decoder is six
    stride-1 transposed convolutions that mirror them. With stride 1, each
    convolution changes every spatial dimension by
    ``2 * padding - kernel_size + 1`` and each transposed convolution undoes
    exactly that change, so the output has the same spatial size as the
    input. With the defaults (64x64 kernels, padding 1) each encoder layer
    removes 61 pixels per dimension, so inputs must be at least 367x367.

    Layer attribute names are part of the checkpoint format (state-dict
    keys) and must not be renamed.
    """

    def __init__(self, kernel_size=64, padding=1):
        """Build the encoder and decoder layers.

        Args:
            kernel_size: Side length of the square kernels used by every
                layer. Defaults to 64, the value previously hard-coded.
            padding: Zero padding used by every layer. Defaults to 1.
        """
        super().__init__()
        conv_args = dict(kernel_size=kernel_size, stride=1, padding=padding)

        ## encoder layers: channels 1 -> 4 -> 4 -> 4 -> 3 -> 2 -> 2 ##
        self.conv1 = nn.Conv2d(1, 4, **conv_args)
        self.conv2 = nn.Conv2d(4, 4, **conv_args)
        self.conv3 = nn.Conv2d(4, 4, **conv_args)
        self.conv4 = nn.Conv2d(4, 3, **conv_args)
        self.conv5 = nn.Conv2d(3, 2, **conv_args)
        self.conv6 = nn.Conv2d(2, 2, **conv_args)

        ## decoder layers: channels 2 -> 2 -> 3 -> 4 -> 4 -> 2 -> 1 ##
        self.t_conv6 = nn.ConvTranspose2d(2, 2, **conv_args)
        self.t_conv5 = nn.ConvTranspose2d(2, 3, **conv_args)
        self.t_conv4 = nn.ConvTranspose2d(3, 4, **conv_args)
        self.t_conv3 = nn.ConvTranspose2d(4, 4, **conv_args)
        self.t_conv2 = nn.ConvTranspose2d(4, 2, **conv_args)
        self.t_conv1 = nn.ConvTranspose2d(2, 1, **conv_args)

    def forward(self, x):
        """Encode and decode ``x``; the result lies in the open range (0, 1).

        Args:
            x: Float tensor of shape (batch, 1, H, W).

        Returns:
            Tensor with the same shape as ``x``, passed through a sigmoid.
        """
        ## encode: convolution + ReLU for every encoder layer ##
        for layer in (self.conv1, self.conv2, self.conv3,
                      self.conv4, self.conv5, self.conv6):
            x = F.relu(layer(x))

        ## decode: transposed convolution + ReLU for all but the last layer ##
        for layer in (self.t_conv6, self.t_conv5, self.t_conv4,
                      self.t_conv3, self.t_conv2):
            x = F.relu(layer(x))

        # Output layer: sigmoid squashes to (0, 1). torch.sigmoid replaces
        # the deprecated F.sigmoid and computes the same values.
        return torch.sigmoid(self.t_conv1(x))
At first, I thought that my convnet wasn’t deep enough, so I added more layers and increased the kernel size, but this didn’t help the loss very much.
Next, my training loop
# Train for 10 epochs. Each epoch makes one optimisation step per training
# image, then measures the loss on the test set without updating the model.
# Inputs and targets are the raw pixel values cast to float; the network's
# sigmoid output is multiplied by 255 so it is compared on the same scale.
for epoch in range(10):
    # Per-epoch accumulators. They must be reset here (not inside the batch
    # loop) so the averages below cover the whole epoch.
    count_train = 0
    count_test = 0
    running_loss = 0.0
    test_loss = 0.0

    model.train()
    for x, y in train_loader:
        # The loader yields batches of 2D images; feed them one at a time.
        for x_img, y_img in zip(x, y):
            # (H, W) -> (1, 1, H, W): add batch and channel dimensions.
            xi = x_img[None, None, ...].float().to(device)
            yi = y_img[None, None, ...].float().to(device)
            decoded = model(xi) * 255
            loss = loss_func(decoded, yi)  # mean square error
            optimizer.zero_grad()
            loss.backward()                # backpropagation, compute gradients
            optimizer.step()               # apply gradients
            count_train += 1
            running_loss += loss.item()

    # Evaluation only: no backward pass and no optimizer step, otherwise the
    # model would be trained on the test data and the test loss would be
    # meaningless.
    model.eval()
    with torch.no_grad():
        for x, y in test_loader:
            for x_img, y_img in zip(x, y):
                xi = x_img[None, None, ...].float().to(device)
                yi = y_img[None, None, ...].float().to(device)
                decoded = model(xi) * 255
                test_loss += loss_func(decoded, yi).item()
                count_test += 1

    # Save a checkpoint every 5 epochs.
    if ((epoch + 1) % 5) == 0:
        torch.save(model.state_dict(), "./model/model-epoch-" + str(epoch + 1) + ".pth")

    # max(..., 1) avoids a ZeroDivisionError when a loader is empty.
    avg_train = running_loss / max(count_train, 1)
    avg_test = test_loss / max(count_test, 1)
    print('Epoch: ', epoch + 1, '| train loss: ' + str(avg_train), ' | test loss ' + str(avg_test))
    # The log keeps the summed (not averaged) losses of the epoch.
    df.loc[epoch + 1] = [epoch + 1, running_loss, test_loss]
In my training loop, I first reshape each 2D image individually into a 4D input. Then I update the gradients, and then compute the test loss on the test dataloader. I save the model every 5 epochs.
Does anyone see an issue with my training code? I think that I may also have an issue in my inference code:
# Rebuild the network and restore the weights saved after epoch 10.
model = ConvAutoencoder().to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint
# written on a GPU can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load("./model/model-epoch-10.pth", map_location=device))
model.eval()
# Read the test image into a NumPy array and close the file afterwards.
with Image.open("./low_res/depth-map-low-0.png") as image_file:
    image = np.array(image_file)
def scale(X, x_min, x_max):
    """Min-max scale ``X`` along axis 0 into the range [x_min, x_max].

    For a 2D array every column is scaled independently using that column's
    own minimum and maximum; a 1D array is scaled as a whole. Columns whose
    values are all equal map to ``x_min``.

    Args:
        X: Array-like of numbers.
        x_min: Lower bound of the target range.
        x_max: Upper bound of the target range.

    Returns:
        A floating-point NumPy array with the same shape as ``X``.
    """
    X = np.asarray(X)
    # Work in floating point: with integer input (e.g. uint8 images) the
    # multiplication by (x_max - x_min) could silently wrap around.
    if not np.issubdtype(X.dtype, np.floating):
        X = X.astype(np.float64)
    low = X.min(axis=0)
    span = X.max(axis=0) - low
    # Avoid dividing by zero for constant columns. np.where also works when
    # `span` is a scalar (1D input), where item assignment would fail.
    span = np.where(span == 0, 1, span)
    return x_min + (X - low) * (x_max - x_min) / span
# Feed the network the same value range it was trained on. In this file the
# dataset returns the raw pixel arrays and the training loop only casts them
# to float, so the input must NOT be rescaled to [0, 1] here. (The previous
# per-column min-max scaling also distorted the image content.)
x = torch.from_numpy(image.astype(np.float32))
print(x.max())
# (H, W) -> (1, 1, H, W): add batch and channel dimensions.
xi = x[None, None, :, :].to(device)
# Inference only, so skip gradient tracking.
with torch.no_grad():
    # The model ends in a sigmoid; multiply by 255 as done during training.
    output = model(xi) * 255
output = output.cpu().numpy()
print(output.shape)
# Drop the batch and channel dimensions to get a 2D image again.
output = np.squeeze(output, axis=None)
print(output.shape)
print(output)
from matplotlib import pyplot as plt
# Show as grayscale on a fixed 0-255 scale; the default colour map and
# auto-normalisation would hide the true brightness of the output.
plt.imshow(output, cmap='gray', vmin=0, vmax=255, interpolation='nearest')
plt.show()
In my code, I first load the convolutional autoencoder from a model file, then set the model to eval mode. Since the grayscale image ranges from 0 to 255, I first scale it to between 0 and 1 with min-max scaling, since during training the ToTensor transform scales automatically to between 0 and 1. I then convert from NumPy to torch, reshape to 4D, and pass the result through the network. I multiply the output by 255 to scale it back to the 0 to 255 range, move the tensor to the CPU and convert it to a NumPy array, then squeeze it to get rid of the batch and channel dimensions, and finally show it as an image.
However, despite this, my network still only outputs images of a single shade at inference time. Is there an error somewhere in the code that I missed? If needed, you can reproduce my results by downloading the source code from the drive link below. Thank you for your time.
Here is a link to my entire project: https://drive.google.com/drive/folders/1mwep_CMZfZI65BJXWqgjjSRU9FBGsSUg?usp=sharing