Hi there, I’m very new to the torchtext module. I would like to load my own (image, caption) pairs and transform each text caption into a tensor suitable as LSTM input.
imgdir is a folder of .jpg image files. txtdir is a folder of .txt files, and each .txt file contains a single line of text (the caption).
First, I’d like to transform each text sentence into a valid LSTM input. Which steps should I follow — e.g. tokenization, building a vocabulary dictionary, word embeddings? I’m not clear on this process. Could you kindly show me some examples, or point me to some tutorials?
Second is about data loader. My code is as follows (text is still represented as an alphabet vector):
import glob, os
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import numpy as np
from PIL import Image
# Per-channel normalization statistics (CIFAR-style values).
MEAN = np.array([0.4914, 0.4822, 0.4465])
STD = np.array([0.2023, 0.1994, 0.2010])

# Reusable torchvision transform building blocks.
NRM = transforms.Normalize(MEAN, STD)
TT = transforms.ToTensor()
TPIL = transforms.ToPILImage()

# Default pipeline: PIL image -> tensor -> normalized tensor (no augmentation).
transform_no_aug = transforms.Compose([TT, NRM])
class DatasetMaker(Dataset):
    """Map-style dataset of (image, caption) pairs from parallel directories.

    Each sample pairs a .jpg image from ``imgdir`` with a fixed-length,
    zero-padded integer encoding of the caption in the same-named .txt file
    from ``txtdir``. Images and captions are loaded eagerly in ``__init__``.
    """

    def __init__(self, txtdir, imgdir, transformFunc=None):
        '''
        :param txtdir: directory of .txt files; each file holds one line of
            lowercase text drawn from ``self.alphabet``, shorter than MAXLEN
        :param imgdir: directory of .jpg images named like the .txt files
        :param transformFunc: callable applied to each PIL image in
            ``__getitem__``; defaults to the module-level ``transform_no_aug``
        :return: ``self.img_caption_pairs`` holds
            [(PIL.Image, np.ndarray_uint8_(MAXLEN,))] entries
        '''
        self.MAXLEN = 400
        # Resolve the default lazily via a None sentinel so the class body
        # does not evaluate the module constant at definition time.
        self.transformFunc = transform_no_aug if transformFunc is None else transformFunc
        # NOTE: '-' occurs twice in this string; the dict comprehension keeps
        # the later index for it. Index 0 is reserved for padding.
        self.alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} "
        self.alphabetDict = {ch: idx + 1 for idx, ch in enumerate(self.alphabet)}
        self.txtnames = [os.path.basename(path) for path in glob.glob(f'{txtdir}/*.txt')]
        self.length = len(self.txtnames)
        self.img_caption_pairs = []
        for txtname in self.txtnames:
            # splitext avoids corrupting filenames whose stem contains 'txt'
            # (the old str.replace('txt', 'jpg') replaced every occurrence).
            imgname = os.path.join(imgdir, os.path.splitext(txtname)[0] + '.jpg')
            with open(os.path.join(txtdir, txtname), 'r') as fin:
                caption_text = fin.readline().strip()
            image = Image.open(imgname)
            caption_mat = self.txt2mat(caption_text)
            self.img_caption_pairs.append((image, caption_mat))

    def __getitem__(self, i):
        """Return (transformed image, caption index tensor) for sample i."""
        image, caption_mat = self.img_caption_pairs[i]
        image = self.transformFunc(image)
        caption_mat = torch.from_numpy(caption_mat)
        return image, caption_mat

    def __len__(self):
        # BUG FIX: self.length is an int, so the old ``sum(self.length)``
        # raised TypeError on every len() / DataLoader call.
        return self.length

    def txt2mat(self, caption_text):
        """Encode a caption as a zero-padded uint8 index vector of length MAXLEN.

        Captions longer than MAXLEN are truncated (the old code crashed with a
        broadcast error); characters outside the alphabet map to the pad
        index 0 (the old code raised KeyError).
        """
        char_mat = np.zeros(self.MAXLEN, dtype=np.uint8)
        caparr = np.array(
            [self.alphabetDict.get(char, 0) for char in caption_text[:self.MAXLEN]],
            dtype=np.uint8,
        )
        char_mat[:len(caparr)] = caparr
        return char_mat
How can I optimize this part of the code?
Any possible help is appreciated. Thx!