import os  # for building image file paths
import pandas as pd  # for reading the annotation file
import torch
import torch.nn as nn  # used by the model classes below
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image  # for loading images
from transformers import BioGptTokenizer  # pretrained BioGPT tokenizer used in MyCollate
class FlickrDataset(Dataset):
def __init__(self, root_dir, captions_file, transform=None):
self.root_dir = root_dir
self.df = pd.read_csv(captions_file)
self.transform = transform
# Get img, caption columns
self.imgs = self.df["filename"] #image
self.captions = self.df["impression"] #caption
def __len__(self):
return len(self.df)
def __getitem__(self, index):
caption = self.captions[index]
img_id = self.imgs[index]
img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")
if self.transform is not None:
img = self.transform(img)
return img, caption
def get_loader(root_folder, annotation_file, transform, batch_size=4, num_workers=1, shuffle=True, pin_memory=True):
dataset = FlickrDataset(root_folder, annotation_file, transform=transform)
loader = DataLoader(
dataset=dataset,
batch_size=batch_size,
num_workers=num_workers,
shuffle=shuffle,
pin_memory=pin_memory,
collate_fn=MyCollate())
return loader, dataset
class MyCollate:
def __init__(self):
self.tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
def __call__(self, batch):
imgs = [item[0].unsqueeze(0) for item in batch]
imgs = torch.cat(imgs, dim=0)
targets = [item[1] for item in batch]
try:
targets = self.tokenizer.batch_encode_plus(targets, padding=True)['input_ids']
for row in targets:
                row.append(self.tokenizer.eos_token_id)  # append EOS to every caption in the batch
        except Exception as e:
print(targets)
return imgs, torch.transpose(torch.tensor(targets), 0, 1)
class DecoderRNN(nn.Module):
def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
super(DecoderRNN, self).__init__()
self.embed = nn.Embedding(vocab_size, embed_size)
self.bilstm = nn.LSTM(embed_size, hidden_size, num_layers, bidirectional=True)
self.linear = nn.Linear(2 * hidden_size, vocab_size)
self.dropout = nn.Dropout(0.5)
def forward(self, features, captions): #(25, 16)
        embeddings = self.dropout(self.embed(captions))  # (25, 16, embed_size)
embeddings = torch.cat((features.unsqueeze(0), embeddings), dim=0)
hiddens, _ = self.bilstm(embeddings)
outputs = self.linear(hiddens)
return outputs
class CNNtoRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):  # images (16,3,224,224) / captions (25,16)
super(CNNtoRNN, self).__init__()
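        # CheXNet is my image encoder (defined elsewhere in my code); it produces one
        # feature vector per image, which is used to seed the decoder below.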
self.encoderCNN = CheXNet(embed_size) #(16, 512)
        self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)
self.tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
def forward(self, images, captions):
#self.encoderCNN.eval()
features = self.encoderCNN(images)
outputs = self.decoderRNN(features, captions)
return outputs
def caption_image(self, image, max_length=30):
result_caption = []
with torch.no_grad():
x = self.encoderCNN(image).unsqueeze(0)
states = None
flag = False
for _ in range(max_length):
hiddens, states = self.decoderRNN.bilstm(x, states)
output = self.decoderRNN.linear(hiddens.squeeze(0))
predicted = output.argmax(1)
result_caption.append(predicted.item())
x = self.decoderRNN.embed(predicted).unsqueeze(0)
if self.tokenizer.decode(predicted.item()) == "</s>": #eos: </s>
if flag:
break
else:
flag = True
return [self.tokenizer.decode(result_caption)]
import torch
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
from torchtext.data.metrics import bleu_score
def train():
transform = transforms.Compose(
[
transforms.Resize((299, 299)),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
)
train_loader, dataset = get_loader(
root_folder="/content/images/images_normalized",
annotation_file="/content/cleaned_text_file.txt",
transform=transform,
batch_size = 8,
num_workers=0,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = False
save_model = True
train_CNN = True
# Hyperparameters
embed_size = 784
hidden_size = 784
vocab_size = 50257
num_layers = 3
learning_rate = 0.0001
num_epochs = 20
step = 0
# initialize model, loss etc
model = CNNtoRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=1)  # ignore the padding index (<PAD> = 1)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
if load_model:
step = load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)
model.train()
for epoch in range(num_epochs):
# Uncomment the line below to see a couple of test cases
# print_examples(model, device, dataset)
if save_model:
checkpoint = {
"state_dict": model.state_dict(),
"optimizer": optimizer.state_dict(),
"step": step,
}
save_checkpoint(checkpoint)
#(imgs, captions) = next(iter(train_loader))
for idx, (imgs, captions) in tqdm(enumerate(train_loader), total=len(train_loader), leave=True):
imgs = imgs.to(device)
captions = captions.to(device)
outputs = model(imgs, captions[:-1]) #captions[:-1]
loss = criterion(outputs.reshape(-1, outputs.shape[2]), captions.reshape(-1))
step += 1
optimizer.zero_grad()
loss.backward()
optimizer.step()
print('Epoch number {} ----> {}.'.format(epoch,loss))
I am using the BioGPT tokenizer, so it is prebuilt: <PAD> is index 1, and <EOS> is not defined, so I am using <SOS>.
Also, note that outputs = model(imgs, captions[:-1]), which means I am not passing the EOS token in; I am leaving the model to predict the EOS by itself.
Please notice criterion = nn.CrossEntropyLoss(ignore_index=1), which ignores the padding index (<PAD>), so padding is not the issue.
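To double-check what the tokenizer actually defines, here is a quick sanity check (a minimal sketch, assuming only that transformers is installed; it just prints the special tokens and their ids):

from transformers import BioGptTokenizer

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
print(tokenizer.special_tokens_map)                         # which special tokens exist
print("pad:", tokenizer.pad_token, tokenizer.pad_token_id)  # index 1 here, used as ignore_index
print("eos:", tokenizer.eos_token, tokenizer.eos_token_id)  # the id MyCollate appends to each caption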
UPDATE: I have tried to overfit my model on one sample by running 500 epochs on it and printing the prediction for that same sample every epoch. The first epoch was random, I got ['zymosan zymosan hr restitution p50 p50 p50 p50 p50 tubation Đzymosan zymosan unpublished'], and after that epoch all predictions were ['</s></s>'].
So I think my model is not learning. Does anyone have an idea why?
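For reference, the overfit check was roughly the following (a sketch reusing the model, criterion, optimizer, train_loader and device built in train() above):

imgs, captions = next(iter(train_loader))       # one fixed batch
imgs, captions = imgs.to(device), captions.to(device)

for epoch in range(500):
    outputs = model(imgs, captions[:-1])
    loss = criterion(outputs.reshape(-1, outputs.shape[2]), captions.reshape(-1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(epoch, loss.item(), model.caption_image(imgs[0:1]))  # prediction on the same sample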