How to use parameters from an autoencoder in a CNN for classification

I am training an autoencoder on images in order to extract the best features from them, and then later use those features in a CNN for classification. I want to know how to use those extracted features in the CNN, because I do not want the CNN to extract them itself.
Below is the code for the autoencoder.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 20 00:01:03 2019

@author: surekhagaikwad
"""

import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn
import torchvision
from torchvision import datasets, transforms
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision.utils import save_image
import os

num_epochs = 30
batch_size = 128
learning_rate = 1e-3
# Hyperparameters
num_classes = 1

loss_list = []
acc_list = []

if not os.path.exists('./mlp_img'):
    os.mkdir('./mlp_img')


def to_img(x):
    x = x.view(x.size(0), 3, 224, 224)
    return x

def plot_sample_img(img, name):
    img = img.view(1, 28, 28)
    save_image(img, './sample_{}.png'.format(name))
    
    


def min_max_normalization(tensor, min_value, max_value):
    min_tensor = tensor.min()
    tensor = (tensor - min_tensor)
    max_tensor = tensor.max()
    tensor = tensor / max_tensor
    tensor = tensor * (max_value - min_value) + min_value
    return tensor


def tensor_round(tensor):
    return torch.round(tensor)

img_transform = transforms.Compose([
    transforms.Resize([224,224]),
    transforms.Grayscale(3),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Lambda(lambda tensor:min_max_normalization(tensor, 0, 1)),
    transforms.Lambda(lambda tensor:tensor_round(tensor))
])


transform = transforms.Compose([
        transforms.Resize([224,224]),
        #transforms.Grayscale(3),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        #transforms.Normalize((0.5,), (0.5,))
        ])
 
dataset = datasets.ImageFolder(root="./data/", transform=img_transform)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=20, shuffle=True, num_workers=1)


class autoencoder(nn.Module):
    def __init__(self):
        super(autoencoder, self).__init__()
        ## encoder layers ##
        # conv layer (depth from 1 --> 16), 3x3 kernels
        self.conv1 = nn.Conv2d(3, 16, 3, padding=1)  
        # conv layer (depth from 16 --> 4), 3x3 kernels
        self.conv2 = nn.Conv2d(16, 4, 3, padding=1)
        # pooling layer to reduce x-y dims by two; kernel and stride of 2
        self.pool = nn.MaxPool2d(2, 2)
        
        ## decoder layers ##
        ## a kernel of 2 and a stride of 2 will increase the spatial dims by 2
        self.t_conv1 = nn.ConvTranspose2d(4, 16, 2, stride=2)
        self.t_conv2 = nn.ConvTranspose2d(16, 3, 2, stride=2)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()


    def forward(self, x):
        ## encode ##
        # add hidden layers with relu activation function
        # and maxpooling after
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        # add second hidden layer
        x = self.relu(self.conv2(x))
        x = self.pool(x)  # compressed representation
        
        ## decode ##
        # add transpose conv layers, with relu activation function
        x = self.relu(self.t_conv1(x))
        # output layer (with sigmoid for scaling from 0 to 1)
        x = self.sigmoid(self.t_conv2(x))
                
        return x
    
model = autoencoder()
# Loss and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

def train_model():
    for epoch in range(num_epochs):
        for data in dataloader:
            img, _ = data
            output = model(img)
            loss = criterion(output, img)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # ===================log========================
        print('epoch [{}/{}], loss:{:.4f}'.format(epoch + 1, num_epochs, loss.item()))
        if epoch % 10 == 0:
            x = to_img(img.cpu().data)
            x_hat = to_img(output.cpu().data)
            save_image(x, './mlp_img/x_{}.png'.format(epoch))
            save_image(x_hat, './mlp_img/x_hat_{}.png'.format(epoch))

train_model()


You could return the “compressed representation” in addition to the last output from your model after training:

    def forward(self, x, return_comp=False):
        ## encode ##
        # add hidden layers with relu activation function
        # and maxpooling after
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        # add second hidden layer
        x = self.relu(self.conv2(x))
        x_comp = self.pool(x)  # compressed representation
        
        ## decode ##
        # add transpose conv layers, with relu activation function
        x = self.relu(self.t_conv1(x_comp))
        # output layer (with sigmoid for scaling from 0 to 1)
        x = self.sigmoid(self.t_conv2(x))
        
        if return_comp:
            return x, x_comp
        else:
            return x

Then just set return_comp=True and store all the features using torch.save (together with the corresponding classes).
Afterwards, you can set up a new CNN and load this data to train it.
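For example, a minimal sketch of that extraction pass (assuming the modified forward above and the existing dataloader; the file names are just placeholders):

# Sketch: run one pass over the data after training and store the
# compressed representations together with their class labels.
features, labels = [], []
model.eval()
with torch.no_grad():
    for img, target in dataloader:
        _, x_comp = model(img, return_comp=True)
        features.append(x_comp)
        labels.append(target)

torch.save(torch.cat(features), './features.pt')  # e.g. [N, 4, 56, 56]
torch.save(torch.cat(labels), './labels.pt')       # e.g. [N]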

After training, how will I send the entire dataset to the forward call in order to get x_comp and x, since I have loaded the images as tensors using a DataLoader?
I will use x_comp as the features for the CNN, but how will I use x as labels, since it is multidimensional…

Can't we use the encoder part directly in the CNN?

You could just run one more epoch pass with your DataLoader and store each feature together with the corresponding target given by the DataLoader (e.g. use the target to select the corresponding folder in which to store the current feature tensor), as in the sketch below.
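A rough sketch of that idea (the folder layout and the ./features path are just assumptions):

import os

# Sketch: save each compressed sample into a folder named after its class,
# so a later Dataset can recover the target from the directory structure.
sample_idx = 0
model.eval()
with torch.no_grad():
    for img, target in dataloader:
        _, x_comp = model(img, return_comp=True)
        for feat, cls in zip(x_comp, target):
            cls_dir = os.path.join('./features', str(cls.item()))
            os.makedirs(cls_dir, exist_ok=True)
            torch.save(feat.clone(), os.path.join(cls_dir, '{}.pt'.format(sample_idx)))
            sample_idx += 1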

That would be a vanilla CNN, wouldn’t it?

Yeah, that's right, it doesn't make sense.

I was somehow able to do that by adding an extra epoch cycle. I am appending each of the target tensors to a list and then finally saving it to a file using torch.save. But when I load the same file I always get only the last tensor… Is it possible to save a list of tensors to a file using torch.save? When I debugged the list, I could see the tensors being added to it.

It got resolved, please ignore it.

I want to visualize each filter's output, basically each layer's output, as potential features… how can I do that?

You could register forward hooks to the layers (as described here) and try to visualize the feature maps.
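A minimal sketch of that approach (the layer choice and the plotting details are just for illustration):

import matplotlib.pyplot as plt

# Sketch: grab the output of conv2 with a forward hook and plot its feature maps.
activations = {}

def hook_fn(module, input, output):
    activations['conv2'] = output.detach()

handle = model.conv2.register_forward_hook(hook_fn)
output = model(img)  # img: one batch from the dataloader
handle.remove()

feat = activations['conv2'][0]  # feature maps of the first sample in the batch
fig, axes = plt.subplots(1, feat.size(0), figsize=(12, 3))
for i, ax in enumerate(axes):
    ax.imshow(feat[i].cpu(), cmap='gray')
    ax.axis('off')
plt.show()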

When I try to visualize the kernels using the code from the Visualize feature map thread, it works only for the first convolutional layer;
for the other layers it fails with the error below.

For the conv2 layer:

# kernel display
import matplotlib.pyplot as plt
from torchvision.utils import make_grid

plt.figure(figsize=(20, 8))
kernels = model.conv2.weight.detach().clone()
kernels = kernels - kernels.min()
kernels = kernels / kernels.max()
img = make_grid(kernels)
plt.imshow(img.permute(1, 2, 0))

The error is:

RuntimeError: The expanded size of the tensor (3) must match the existing size (64) at non-singleton dimension 0. Target sizes: [3, 3, 3]. Tensor sizes: [64, 3, 3]

Depending on the number of in_channels, you cannot visualize the kernels as standard RGB images.
E.g. if in_channels=64, you could visualize each channel as a grayscale image.
In the example, the user was dealing with 3 input channels and 96 kernels, if I remember the use case correctly.
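For example, a rough sketch that spreads every channel of every kernel out as its own grayscale image (a generic approach, not the exact code from the linked thread):

from torchvision.utils import make_grid
import matplotlib.pyplot as plt

# Sketch: flatten [out_channels, in_channels, k, k] to [out*in, 1, k, k]
# so each channel of each kernel becomes a single-channel image.
kernels = model.conv2.weight.detach().clone()
kernels = kernels - kernels.min()
kernels = kernels / kernels.max()
n_out, n_in, kh, kw = kernels.shape
grid = make_grid(kernels.view(n_out * n_in, 1, kh, kw), nrow=n_in)
plt.figure(figsize=(20, 8))
plt.imshow(grid[0], cmap='gray')  # make_grid repeats the single channel; show one plane
plt.show()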

Well, that got resolved… thanks!

I am trying to use x_comp (the compressed features from the autoencoder) in a CNN with the architecture given below.

class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        
        self.conv1 = nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        
        self.conv2 = nn.Conv2d(64, 192, kernel_size=5, padding=2)
        self.pool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        
        
        self.conv3 = nn.Conv2d(192, 384, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(384, 256, kernel_size=3, padding=1)
        
        self.conv5 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.pool5 = nn.MaxPool2d(kernel_size=3, stride=2)
        
        self.relu = nn.ReLU()
        
        self.drop_out = nn.Dropout()
        self.fc1 = nn.Linear(6 * 6 * 256, 4096)
        self.fc2 = nn.Linear(4096, 4096)
        self.fc3 = nn.Linear(4096, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.pool1(self.relu(self.conv1(x)))
        out = self.pool2(self.relu(self.conv2(out)))
        out = self.relu(self.conv3(out))
        out = self.relu(self.conv4(out))
        out = self.pool5(self.relu(self.conv5(out)))
        out = out.reshape(out.size(0), -1)
        
        #out = self.drop_out(out)
        out = self.drop_out(out)
        out = self.relu(self.fc1(out))
        out = self.drop_out(out)
        out = self.relu(self.fc2(out))
        out = self.sigmoid(self.fc3(out))

        return out

Now I am trying to freeze the convolutional part and use only the fully connected layers to perform classification.

model.conv1.weight.requires_grad = False
model.conv2.weight.requires_grad = False
model.conv3.weight.requires_grad = False
model.conv4.weight.requires_grad = False
model.conv5.weight.requires_grad = False
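Note that this freezes only the .weight tensors; the biases of these conv layers stay trainable. A minimal sketch that freezes every parameter of the conv layers instead:

# Freeze all parameters (weights and biases) of the convolutional layers.
for layer in [model.conv1, model.conv2, model.conv3, model.conv4, model.conv5]:
    for param in layer.parameters():
        param.requires_grad = False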

But when I run the code it gives me below error

RuntimeError: Expected 4-dimensional input for 4-dimensional weight 64, but got 1-dimensional input of size [20] instead

which is true because the dimensions of x_comp are different. But how can I resolve it? Does it mean that I need to change the architecture of the CNN?

x_comp should still be 4-dimensional, as it should be the activation of a conv layer.
Could you print the shape and the code you’ve used to save these tensors?

Earlier I was saving it in a list, which was creating the issue… my bad… now I am saving it directly to a file as a tensor.

def train_model():
    for epoch in range(num_epochs + 1):
        for data in dataloader:
            img, target = data
            if epoch == num_epochs:
                # extra pass after training: keep the compressed representation and save it
                output, x_comp = model(img, return_comp=True)
                torch.save(target, './target.pt')
                torch.save(x_comp, './x_comp.pt')
            else:
                output = model(img)
            loss = criterion(output, img)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

This is how I am saving it.

The shapes of the tensors are:

for target

torch.Size([6, 3, 224, 224])

for x_comp

torch.Size([6, 4, 56, 56])

And below is the CNN that I am using for classification, where I have frozen the convolutional layers and am using only the FC layers.

class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        
        self.conv1 = nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        
        self.conv2 = nn.Conv2d(64, 192, kernel_size=5, padding=2)
        self.pool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        
        
        self.conv3 = nn.Conv2d(192, 384, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(384, 256, kernel_size=3, padding=1)
        
        self.conv5 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.pool5 = nn.MaxPool2d(kernel_size=3, stride=2)
        
        self.relu = nn.ReLU()
        
        self.drop_out = nn.Dropout()
        self.fc1 = nn.Linear(6 * 6 * 256, 4096)
        self.fc2 = nn.Linear(4096, 4096)
        self.fc3 = nn.Linear(4096, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.pool1(self.relu(self.conv1(x)))
        out = self.pool2(self.relu(self.conv2(out)))
        out = self.relu(self.conv3(out))
        out = self.relu(self.conv4(out))
        out = self.pool5(self.relu(self.conv5(out)))
        out = out.reshape(out.size(0), -1)
        
        #out = self.drop_out(out)
        out = self.drop_out(out)
        out = self.relu(self.fc1(out))
        out = self.drop_out(out)
        out = self.relu(self.fc2(out))
        out = self.sigmoid(self.fc3(out))

        return out

I am unable to understand how I will use those extracted features in the CNN.

The extracted activations can be used like any other conv input, i.e. you can treat them as a new “image tensor” with 4 channels and a spatial size of 56x56.
To do this, you would have to change the number of input channels of the first conv layer of your new model to 4. Also, since your spatial size is now smaller, the last pooling layer needs a smaller window and the number of input features for self.fc1 drops to 256.
This code should work:

class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        
        self.conv1 = nn.Conv2d(4, 64, kernel_size=11, stride=4, padding=2)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        
        self.conv2 = nn.Conv2d(64, 192, kernel_size=5, padding=2)
        self.pool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        
        
        self.conv3 = nn.Conv2d(192, 384, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(384, 256, kernel_size=3, padding=1)
        
        self.conv5 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        # the feature map is 2x2 at this point for a 56x56 input, so pool it down to 1x1
        self.pool5 = nn.MaxPool2d(kernel_size=2, stride=2)
        
        self.relu = nn.ReLU()
        
        self.drop_out = nn.Dropout()
        self.fc1 = nn.Linear(256, 4096)
        self.fc2 = nn.Linear(4096, 4096)
        self.fc3 = nn.Linear(4096, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.pool1(self.relu(self.conv1(x)))
        out = self.pool2(self.relu(self.conv2(out)))
        out = self.relu(self.conv3(out))
        out = self.relu(self.conv4(out))
        out = self.pool5(self.relu(self.conv5(out)))
        out = out.view(out.size(0), -1)
        
        #out = self.drop_out(out)
        out = self.drop_out(out)
        out = self.relu(self.fc1(out))
        out = self.drop_out(out)
        out = self.relu(self.fc2(out))
        out = self.sigmoid(self.fc3(out))

        return out
    
model = ConvNet()
x = torch.randn(6, 4, 56, 56)
out = model(x)

Thanks… one more thing: as of now, the compressed image data and the labels are saved separately, but I want to combine them in my train_loader (DataLoader) so that I can iterate through both of them while training the model… what is the easiest way to do that?

Note: I want to split the compressed data into a train_loader and a test_loader.

I think the best approach would be to write a custom Dataset that takes the paths of the data tensors and the targets, split the paths into a training and a validation set, and create the training and validation Datasets from the path subsets, as sketched below.
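A minimal sketch of such a Dataset, assuming each feature tensor was saved as its own .pt file and a target is available for each path (the class name and file layout are hypothetical):

import torch
from torch.utils.data import Dataset, DataLoader

class FeatureDataset(Dataset):
    """Loads pre-extracted feature tensors and their targets from file paths."""
    def __init__(self, feature_paths, targets):
        self.feature_paths = feature_paths
        self.targets = targets

    def __len__(self):
        return len(self.feature_paths)

    def __getitem__(self, index):
        x = torch.load(self.feature_paths[index])  # e.g. [4, 56, 56]
        y = self.targets[index]
        return x, y

# Hypothetical usage: split the paths first, then build one Dataset per split.
# train_dataset = FeatureDataset(train_paths, train_targets)
# val_dataset = FeatureDataset(val_paths, val_targets)
# train_loader = DataLoader(train_dataset, batch_size=20, shuffle=True)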

Hello! I'm having the same problem as you; could you please tell me how to save the output of the encoder and use it in a CNN classification network? Thank you for your help!