Running_mean should contain 1 elements not 256

For context, I’m using this tutorial for Image Captioning GitHub and I’m now trying to fine tune it, but to my dataset.

I just have one image and one caption and I’m trying to train the model to that, 1ºProblem I encountered was because I was using the BatchNorm1d and changed to InstanceNorm1d.
Now the error is the one on the title and this is the code.

The encoder:

class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size) = nn.BatchNorm1d(embed_size, momentum=0.01) = nn.InstanceNorm1d(256, track_running_stats=True)

    def forward(self, images):
        with torch.no_grad():
            features = self.resnet(images)
        features = features.reshape(features.size(0), -1)
        features =
        return features

And this is the loader:

class CustomDataset(Dataset):
    def __init__(self, image_folder, caption_folder, transform=None):
        self.image_folder = image_folder
        self.caption_folder = caption_folder
        self.transform = transform

    def __len__(self):
        return 256
        #return len(os.listdir(self.image_folder))
    def __getitem__(self, idx):
        #img_name = os.path.join(self.image_folder, str(idx) + '.jpg')
        img_name = os.path.join(self.image_folder, "0" + '.jpg')
        img =

        transform = transforms.Compose([ 
            transforms.Normalize((0.485, 0.456, 0.406), 
                                (0.229, 0.224, 0.225))])

        img = transform(img)
        img = torch.randn(256, 256)
        img = img.repeat(3, 1, 1)

        def tokenize(caption, vocabulary):
            words = caption.split()
            tokens = [vocabulary(word) for word in words]
            return tokens

        tokenized_captions = tokenize(caption, vocab)

        print("Image shape:", img.shape)

        return img, torch.tensor(tokenized_captions)

dataset = CustomDataset(image_folder='images/',
dataloader = DataLoader(dataset, batch_size=1, shuffle=True, drop_last=False)

The model loader (just in case anyone finds the problem here):

class FineTuneModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(FineTuneModel, self).__init__()
        self.encoder = EncoderCNN(embed_size)
        self.decoder = DecoderRNN(embed_size,
    def forward(self, images, captions, lengths):
        features = self.encoder(images)
        outputs = self.decoder(features, captions, lengths)
        return outputs

For reference it’s I’m trying to train a Gray Scale image on a RGB trained model, but I’ve done the conversion.

Thanks in advance.