For context, I’m using this tutorial for Image Captioning GitHub and I’m now trying to fine tune it, but to my dataset.
I just have one image and one caption and I’m trying to train the model to that, 1ºProblem I encountered was because I was using the BatchNorm1d and changed to InstanceNorm1d.
Now the error is the one on the title and this is the code.
The encoder:
class EncoderCNN(nn.Module):
def __init__(self, embed_size):
super(EncoderCNN, self).__init__()
resnet = models.resnet152(pretrained=True)
modules = list(resnet.children())[:-1]
self.resnet = nn.Sequential(*modules)
self.linear = nn.Linear(resnet.fc.in_features, embed_size)
#self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)
self.bn = nn.InstanceNorm1d(256, track_running_stats=True)
def forward(self, images):
with torch.no_grad():
features = self.resnet(images)
features = features.reshape(features.size(0), -1)
features = self.bn(self.linear(features))
return features
And this is the loader:
class CustomDataset(Dataset):
def __init__(self, image_folder, caption_folder, transform=None):
self.image_folder = image_folder
self.caption_folder = caption_folder
self.transform = transform
def __len__(self):
return 256
#return len(os.listdir(self.image_folder))
def __getitem__(self, idx):
#img_name = os.path.join(self.image_folder, str(idx) + '.jpg')
img_name = os.path.join(self.image_folder, "0" + '.jpg')
img = Image.open(img_name)
transform = transforms.Compose([
transforms.Resize((256,256)),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize((0.485, 0.456, 0.406),
(0.229, 0.224, 0.225))])
img = transform(img)
img = torch.randn(256, 256)
img.unsqueeze_(0)
img = img.repeat(3, 1, 1)
def tokenize(caption, vocabulary):
words = caption.split()
tokens = [vocabulary(word) for word in words]
return tokens
tokenized_captions = tokenize(caption, vocab)
print("Image shape:", img.shape)
return img, torch.tensor(tokenized_captions)
dataset = CustomDataset(image_folder='images/',
caption_folder='captions/',
transform=transforms.ToTensor())
dataloader = DataLoader(dataset, batch_size=1, shuffle=True, drop_last=False)
The model loader (just in case anyone finds the problem here):
class FineTuneModel(nn.Module):
def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
super(FineTuneModel, self).__init__()
self.encoder = EncoderCNN(embed_size)
self.decoder = DecoderRNN(embed_size,
hidden_size,
vocab_size,
num_layers,
max_seq_length=20)
def forward(self, images, captions, lengths):
features = self.encoder(images)
outputs = self.decoder(features, captions, lengths)
return outputs
For reference it’s I’m trying to train a Gray Scale image on a RGB trained model, but I’ve done the conversion.
Thanks in advance.