In detail
I am trying image captioning on an Instagram dataset, but it does not work well: the predicted image caption comes out wrong (the example output appears to have been lost in formatting). I have tried revising the code several times, but it still does not work. Why does it not work well?
Code
import pandas as pd
import csv

# Clean the raw Instagram caption CSV and write it back out for the dataset.
link = '/content/drive/MyDrive/input/instagram/instagram_data/captions_csv.csv'
df = pd.read_csv(link, encoding='utf-8')
# Drop rows with any missing value.
df = df.dropna(axis=0)
# BUG FIX: drop_duplicates with inplace=False returns a NEW frame; the
# original discarded the result, so duplicates were never actually removed.
df = df.drop_duplicates(subset=None, keep='first', ignore_index=False)
# Replace every non-word character (punctuation, emoji, ...) with a space.
df['Caption'] = df['Caption'].str.replace(pat=r'[^\w]', repl=r' ', regex=True)
df.to_csv('/content/drive/MyDrive/input/instagram/instagram_data/captions_csv_2.txt', index=False)
data_location = "./input/instagram"
#imports
import numpy as np
import torch
from torch.utils.data import DataLoader,Dataset
import torchvision.transforms as T
#imports
import os
from collections import Counter
import numpy as np
import pandas as pd
import spacy
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader,Dataset
import torchvision.transforms as T
from PIL import Image
class Vocabulary:
    """Maps caption tokens to integer ids and back."""

    # Shared spaCy pipeline, used only for its tokenizer.
    spacy_eng = spacy.load("en_core_web_sm")

    def __init__(self, freq_threshold):
        # index -> token, with the four reserved special tokens first
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        # token -> index (inverse of itos)
        self.stoi = {tok: ix for ix, tok in self.itos.items()}
        self.freq_threshold = freq_threshold

    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenize(text):
        """Return the lower-cased spaCy tokens of *text*."""
        doc = Vocabulary.spacy_eng.tokenizer(text)
        return [tok.text.lower() for tok in doc]

    def build_vocab(self, sentence_list):
        """Register every token whose corpus frequency reaches freq_threshold."""
        counts = Counter()
        next_idx = 4  # ids 0-3 are reserved for the special tokens
        for sentence in sentence_list:
            for word in self.tokenize(sentence):
                counts[word] += 1
                # Add the word exactly once, the moment it hits the threshold.
                if counts[word] == self.freq_threshold:
                    self.stoi[word] = next_idx
                    self.itos[next_idx] = word
                    next_idx += 1

    def numericalize(self, text):
        """Token-id list for *text*; out-of-vocabulary words map to <UNK>."""
        unk = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk) for token in self.tokenize(text)]
class InstaDataset(Dataset):
    """Instagram image-caption dataset.

    Parameters
    ----------
    root_dir : str
        Directory that contains the ``.jpg`` images.
    caption_file : str
        CSV file with ``Image File`` and ``Caption`` columns.
    transform : callable, optional
        Transform applied to each loaded PIL image.
    freq_threshold : int
        Minimum corpus frequency for a word to enter the vocabulary.
    """

    def __init__(self, root_dir, caption_file, transform=None, freq_threshold=5):
        self.root_dir = root_dir
        self.transform = transform
        self.df = pd.read_csv(caption_file)
        # Image file stems and raw caption strings.
        self.imgs = self.df["Image File"]
        self.captions = self.df["Caption"]
        # Build the vocabulary from every caption in the file.
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocab(self.captions.tolist())

    @staticmethod
    def rmEmoji(inputData):
        """Strip non-ASCII characters (emoji etc.) from *inputData*.

        BUG FIX: the original encoded to UTF-8, which can represent every
        character, so nothing was ever removed; round-tripping through ASCII
        actually drops emoji. Also made it a @staticmethod — the original
        definition was missing ``self``.
        """
        return inputData.encode('ascii', 'ignore').decode('ascii')

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        caption = self.captions[idx]
        # IMPROVEMENT: os.path.join instead of manual "/" concatenation.
        img_location = os.path.join(self.root_dir, self.imgs[idx] + ".jpg")
        img = Image.open(img_location).convert("RGB")
        # Apply the transformation to the image, if any.
        if self.transform is not None:
            img = self.transform(img)
        # Numericalize the caption: <SOS> token-ids... <EOS>.
        caption_vec = [self.vocab.stoi["<SOS>"]]
        caption_vec += self.vocab.numericalize(caption)
        caption_vec += [self.vocab.stoi["<EOS>"]]
        return img, torch.tensor(caption_vec)
class CapsCollate:
    """DataLoader collate helper: stack images, pad captions to equal length."""

    def __init__(self, pad_idx, batch_first=False):
        self.pad_idx = pad_idx          # vocabulary index used as padding value
        self.batch_first = batch_first  # layout forwarded to pad_sequence

    def __call__(self, batch):
        # Stack all image tensors along a new leading batch dimension.
        images = torch.stack([sample[0] for sample in batch], dim=0)
        # Pad the variable-length caption tensors with pad_idx.
        captions = pad_sequence(
            [sample[1] for sample in batch],
            batch_first=self.batch_first,
            padding_value=self.pad_idx,
        )
        return images, captions
def get_data_loader(dataset, batch_size, shuffle=False, num_workers=1):
    """Build a torch DataLoader over the Insta dataset with caption padding.

    Parameters
    ----------
    dataset : InstaDataset
        Custom torch dataset named InstaDataset.
    batch_size : int
        Number of samples to load per batch.
    shuffle : bool, optional
        Whether to shuffle the dataset (default is False).
    num_workers : int, optional
        Number of loader worker processes (default is 1).
    """
    # Pad captions with the dataset's <PAD> id; batch-first layout.
    collate = CapsCollate(pad_idx=dataset.vocab.stoi["<PAD>"], batch_first=True)
    return DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=collate,
    )
# Loader configuration and dataset/dataloader construction.
BATCH_SIZE = 32
# BATCH_SIZE = 6
NUM_WORKER = 4
#defining the transform to be applied
# NOTE(review): RandomCrop is a train-time augmentation; for inference a
# deterministic CenterCrop would be more usual — confirm this is intended.
transforms = T.Compose([
    T.Resize(226),
    T.RandomCrop(224),
    T.ToTensor(),
    # Standard ImageNet channel statistics, matching the pretrained ResNet
    # encoder used below.
    T.Normalize((0.485, 0.456, 0.406),(0.229, 0.224, 0.225))
])
#testing the dataset class
dataset = InstaDataset(
    root_dir = "/content/drive/MyDrive/input/instagram/instagram_data/instagram_data",
    caption_file = "/content/drive/MyDrive/input/instagram/instagram_data/captions_csv_2.txt",
    transform=transforms
)
#writing the dataloader
data_loader = get_data_loader(
    dataset=dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKER,
    shuffle=True,
    # batch_first=False
)
#vocab_size
vocab_size = len(dataset.vocab)
# Train on GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# NOTE(review): this class is a byte-for-byte duplicate of the InstaDataset
# defined earlier in the file. Re-defining it here only rebinds the name;
# the `dataset` object created above still uses the first definition, so
# this second copy is dead code and can almost certainly be deleted.
class InstaDataset(Dataset):
    """
    InstaDataset
    """
    def __init__(self,root_dir,caption_file,transform=None,freq_threshold=5):
        self.root_dir = root_dir
        self.transform = transform
        self.df=pd.read_csv(caption_file)
        #Get image and caption colum from the dataframe
        self.imgs = self.df["Image File"]
        self.captions = self.df["Caption"]
        #Initialize vocabulary and build vocab
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocab(self.captions.tolist())
    # NOTE(review): missing `self` — as written this is only callable as
    # InstaDataset.rmEmoji(text); it is never called in this file. The
    # UTF-8 round-trip also removes nothing, since UTF-8 encodes everything.
    def rmEmoji(inputData):
        return inputData.encode('utf-8', 'ignore').decode('utf-8')
    def __len__(self):
        return len(self.df)
    def __getitem__(self,idx):
        caption = self.captions[idx]
        img_name = "/"+self.imgs[idx]+".jpg"
        img_location = self.root_dir+img_name
        img = Image.open(img_location).convert("RGB")
        #apply the transfromation to the image
        if self.transform is not None:
            img = self.transform(img)
        #numericalize the caption text
        caption_vec = []
        caption_vec += [self.vocab.stoi["<SOS>"]]
        caption_vec += self.vocab.numericalize(caption)
        caption_vec += [self.vocab.stoi["<EOS>"]]
        return img, torch.tensor(caption_vec)
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models
from torch.utils.data import DataLoader,Dataset
import torchvision.transforms as T
class EncoderCNN(nn.Module):
    """Frozen ResNet-50 backbone producing a grid of spatial feature vectors."""

    def __init__(self):
        super().__init__()
        backbone = models.resnet50(pretrained=True)
        # Freeze the backbone: only the decoder is trained.
        for p in backbone.parameters():
            p.requires_grad_(False)
        # Drop the final avgpool and fc layers to keep the 7x7 feature map.
        self.resnet = nn.Sequential(*list(backbone.children())[:-2])

    def forward(self, images):
        feats = self.resnet(images)        # (batch, 2048, 7, 7)
        feats = feats.permute(0, 2, 3, 1)  # (batch, 7, 7, 2048)
        # Flatten the spatial grid into 49 location vectors per image.
        return feats.reshape(feats.size(0), -1, feats.size(-1))  # (batch, 49, 2048)
# Bahdanau (additive) attention over the encoder's spatial locations.
class Attention(nn.Module):
    """Score each encoder location against the decoder state and pool."""

    def __init__(self, encoder_dim, decoder_dim, attention_dim):
        super().__init__()
        self.attention_dim = attention_dim
        self.W = nn.Linear(decoder_dim, attention_dim)  # projects hidden state
        self.U = nn.Linear(encoder_dim, attention_dim)  # projects features
        self.A = nn.Linear(attention_dim, 1)            # scalar score per location

    def forward(self, features, hidden_state):
        # features: (batch, L, encoder_dim); hidden_state: (batch, decoder_dim)
        proj_feat = self.U(features)                  # (batch, L, attention_dim)
        proj_hid = self.W(hidden_state).unsqueeze(1)  # (batch, 1, attention_dim)
        scores = self.A(torch.tanh(proj_feat + proj_hid)).squeeze(2)  # (batch, L)
        alpha = F.softmax(scores, dim=1)              # attention weights, (batch, L)
        # Weighted sum of the features -> context vector (batch, encoder_dim).
        context = (features * alpha.unsqueeze(2)).sum(dim=1)
        return alpha, context
# Attention Decoder
class DecoderRNN(nn.Module):
    """Attention LSTM decoder that emits one vocabulary token per step.

    Parameters
    ----------
    embed_size : int
        Word-embedding dimension.
    vocab_size : int
        Number of tokens in the vocabulary.
    attention_dim : int
        Hidden size of the additive-attention MLP.
    encoder_dim : int
        Channel dimension of the encoder features (2048 for ResNet-50).
    decoder_dim : int
        LSTM hidden-state size.
    drop_prob : float
        Dropout applied before the output projection.
    """

    def __init__(self, embed_size, vocab_size, attention_dim, encoder_dim,
                 decoder_dim, drop_prob=0.3):
        super().__init__()
        self.vocab_size = vocab_size
        self.attention_dim = attention_dim
        self.decoder_dim = decoder_dim
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.attention = Attention(encoder_dim, decoder_dim, attention_dim)
        # Linear maps deriving the initial LSTM state from mean features.
        self.init_h = nn.Linear(encoder_dim, decoder_dim)
        self.init_c = nn.Linear(encoder_dim, decoder_dim)
        self.lstm_cell = nn.LSTMCell(embed_size + encoder_dim, decoder_dim, bias=True)
        self.f_beta = nn.Linear(decoder_dim, encoder_dim)
        self.fcn = nn.Linear(decoder_dim, vocab_size)
        self.drop = nn.Dropout(drop_prob)

    def forward(self, features, captions):
        """Teacher-forced decoding; returns (preds, alphas)."""
        embeds = self.embedding(captions)
        h, c = self.init_hidden_state(features)  # each (batch, decoder_dim)
        # Decode len-1 steps: the final token is a target, never an input.
        seq_length = captions.size(1) - 1
        batch_size = captions.size(0)
        num_features = features.size(1)
        # BUG FIX: allocate on the input's device instead of relying on a
        # module-level `device` global.
        preds = torch.zeros(batch_size, seq_length, self.vocab_size,
                            device=features.device)
        alphas = torch.zeros(batch_size, seq_length, num_features,
                             device=features.device)
        for s in range(seq_length):
            alpha, context = self.attention(features, h)
            lstm_input = torch.cat((embeds[:, s], context), dim=1)
            h, c = self.lstm_cell(lstm_input, (h, c))
            preds[:, s] = self.fcn(self.drop(h))
            alphas[:, s] = alpha
        return preds, alphas

    def generate_caption(self, features, max_len=20, vocab=None):
        """Greedily decode a caption for one image's encoder features.

        BUG FIX: the original contained statements such as
        ``print('features'+features)`` which raise
        ``TypeError: can only concatenate str (not "Tensor") to str`` and
        crashed inference before any caption was produced — the likely cause
        of "captioning didn't work". All such prints are removed.
        """
        batch_size = features.size(0)
        h, c = self.init_hidden_state(features)  # (batch, decoder_dim)
        alphas = []
        # Start decoding from the <SOS> token.
        word = torch.tensor(vocab.stoi['<SOS>'], device=features.device).view(1, -1)
        embeds = self.embedding(word)
        captions = []
        for _ in range(max_len):
            alpha, context = self.attention(features, h)
            # Store the attention map for later visualization.
            alphas.append(alpha.cpu().detach().numpy())
            lstm_input = torch.cat((embeds[:, 0], context), dim=1)
            h, c = self.lstm_cell(lstm_input, (h, c))
            output = self.fcn(self.drop(h)).view(batch_size, -1)
            # Greedy choice: the highest-scoring vocabulary index.
            predicted_word_idx = output.argmax(dim=1)
            captions.append(predicted_word_idx.item())
            # Stop as soon as <EOS> is generated.
            if vocab.itos[predicted_word_idx.item()] == "<EOS>":
                break
            # Feed the prediction back in as the next input word.
            embeds = self.embedding(predicted_word_idx.unsqueeze(0))
        # Convert vocabulary ids back to words.
        return [vocab.itos[idx] for idx in captions], alphas

    def init_hidden_state(self, encoder_out):
        """Initial (h, c) derived from the mean encoder feature vector."""
        mean_encoder_out = encoder_out.mean(dim=1)
        return self.init_h(mean_encoder_out), self.init_c(mean_encoder_out)
class EncoderDecoder(nn.Module):
    """Full captioning model: frozen CNN encoder + attention LSTM decoder."""

    def __init__(self, embed_size, vocab_size, attention_dim, encoder_dim,
                 decoder_dim, drop_prob=0.3):
        super().__init__()
        self.encoder = EncoderCNN()
        # BUG FIX: use the vocab_size and drop_prob constructor arguments.
        # The original ignored them and read the global `dataset` instead,
        # making the class silently depend on notebook state.
        self.decoder = DecoderRNN(
            embed_size=embed_size,
            vocab_size=vocab_size,
            attention_dim=attention_dim,
            encoder_dim=encoder_dim,
            decoder_dim=decoder_dim,
            drop_prob=drop_prob,
        )

    def forward(self, images, captions):
        features = self.encoder(images)
        return self.decoder(features, captions)
# Hyper-parameters.
embed_size = 300
vocab_size = len(dataset.vocab)
attention_dim = 256
encoder_dim = 2048  # ResNet-50 feature channels
decoder_dim = 512
learning_rate = 3e-4

# CONSISTENCY FIX: pass the named hyper-parameters instead of repeating the
# literals, so changing a variable above actually changes the model.
model = EncoderDecoder(
    embed_size=embed_size,
    vocab_size=vocab_size,
    attention_dim=attention_dim,
    encoder_dim=encoder_dim,
    decoder_dim=decoder_dim,
).to(device)

# Ignore <PAD> positions when computing the loss.
criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi["<PAD>"])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
num_epochs = 25
print_every = 100
# Training loop: teacher-forced cross-entropy over caption tokens, with a
# periodic qualitative sample to eyeball progress.
for epoch in range(1,num_epochs+1):
    for idx, (image, captions) in enumerate(iter(data_loader)):
        image,captions = image.to(device),captions.to(device)
        # Zero the gradients.
        optimizer.zero_grad()
        # Feed forward
        outputs,attentions = model(image, captions)
        # Calculate the batch loss: the prediction at step s is scored
        # against the token at position s+1 (captions shifted left by one).
        targets = captions[:,1:]
        loss = criterion(outputs.view(-1, vocab_size), targets.reshape(-1))
        # Backward pass.
        loss.backward()
        # Update the parameters in the optimizer.
        optimizer.step()
        if (idx+1)%print_every == 0:
            print("Epoch: {} loss: {:.5f}".format(epoch,loss.item()))
            #generate the caption for the first image of a fresh batch
            model.eval()
            with torch.no_grad():
                dataiter = iter(data_loader)
                img,_ = next(dataiter)
                features = model.encoder(img[0:1].to(device))
                caps,alphas = model.decoder.generate_caption(features,vocab=dataset.vocab)
                caption = ' '.join(caps)
                # NOTE(review): show_image is not defined in this snippet —
                # presumably a plotting helper defined elsewhere; confirm.
                show_image(img[0],title=caption)
            model.train()
    #save the latest model
    # NOTE(review): save_model is also not defined in this snippet — confirm
    # it exists elsewhere, otherwise this raises NameError at epoch end.
    save_model(model,epoch)