I literally tried everything.Need help. #Error#. TypeError: conv2d(): argument 'input' (position 1) must be Tensor, not tuple

Define Autoencoder

#dropout_p= 0.5
class Autoencoder(nn.Module):

def __init__(self):
    super(Autoencoder, self).__init__()
    self.encoder = nn.Sequential(
       # nn.Dropout2d(dropout_p= 0.5),
        nn.Conv2d(1, 16,kernel_size=3,stride =1, padding= 1,bias= False),  
        nn.MaxPool2d( 2, 2,return_indices=True),  
        nn.Conv2d(16,32,kernel_size=3,stride =1, padding=1),  
        nn.MaxPool2d(2, 2,return_indices=True),  
        nn.Conv2d(32,64,kernel_size=3,stride =1, padding= 1),  
        nn.MaxPool2d(2, 2,return_indices=True),  
    self.decoder = nn.Sequential(
        nn.MaxUnpool2d(2, 2),
        nn.ConvTranspose2d(64, 32, kernel_size=3), 
        nn.MaxUnpool2d(2, 2),
        nn.ConvTranspose2d(32, 16, kernel_size=3),  
        nn.MaxUnpool2d(2, 2),
        nn.ConvTranspose2d(16, 8, kernel_size=3)

def forward(self, x):
    x = self.encoder(x)
    encoded_x = x
    x = self.decoder(x)
    return x, encoded_x

num_epochs = 5
batch_size = 128
learning_rate = 1e-5
model= Autoencoder()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
for data in loader_train_set:
img, label = data
img = img.view(-1,1,320,240)
img = Variable(img)
# img = torch.from_numpy(numpy.array(img))
encoded_output = model.encoder(img)
loss = criterion(encoded_output, img)

print(‘epoch [{}/{}], loss:{:.4f}’.format(epoch+1, num_epochs, loss.data()))


The problem is that you’re using MaxPool2d with return_indices=True.
This means that it will return a tuple with both the value and the indices.
This is not handled by the convolution layer that follows.

Thank you. It worked, but got “RuntimeError: only batches of spatial targets supported (3D tensors) but got targets of dimension: 4” error

This happens in the CrossEntropyLoss right?
This is because the input is not of the right shape. You should check the doc here.
If you want to do classification for each pixel given the score in the channel, you want to transpose the output to be of size nb_samples x nb_classes. And the label should be nb_samples. Where nb_samples = batch_size * height * width.

Can you please write it in a code

It depends a bit on the task you want to do.
if you want cross entropy for each pixel independently:

output_of_net = torch.rand(batch, nb_class, height, width)
label = torch.LongTensor(batch, height, width)

output_of_net_linearized = output_of_net.permute(0, 2, 3, 1).reshape(-1, output_of_net.size(1))
label_linearized = label.view(-1)
loss = criterion(output_of_net_linearized, label_linearized)