How to load (image, caption) pairs and transform text to LSTM input?

Hi there, I’m very new to the torchtext module. I would like to load my own (image, caption) pairs and transform each text caption into a tensor suitable as LSTM input.

imgdir is a folder with .jpg image files. txtdir is a folder with .txt files and each .txt file is one line of sentence.

First I’d like to transform each text sentence into a valid input for an LSTM. Which steps should I use — tokenization, a vocabulary dictionary, word embeddings? I’m not clear about this process. Could you please show me some examples, or point me to some tutorials?

Second is about data loader. My code is as follows (text is still represented as an alphabet vector):

import glob, os
import torch
import torchvision.transforms as transforms
# Fix: the source module path was missing entirely ("from import ..."),
# which is a SyntaxError. Dataset/DataLoader live in torch.utils.data.
from torch.utils.data import Dataset, DataLoader
import numpy as np
from PIL import Image

# Per-channel RGB normalization statistics (these are the commonly used
# CIFAR-10 values — confirm they match your own image corpus).
MEAN = np.asarray([0.4914, 0.4822, 0.4465])
STD = np.asarray([0.2023, 0.1994, 0.2010])
NRM = transforms.Normalize(MEAN, STD)
TT = transforms.ToTensor()
TPIL = transforms.ToPILImage()
# Default image pipeline: convert to tensor, then normalize (no augmentation).
transform_no_aug = transforms.Compose([TT, NRM])

class DatasetMaker(Dataset):
    """Dataset of (image, caption) pairs loaded from two parallel folders.

    Each ``<name>.txt`` in *txtdir* is expected to contain a single lowercase
    caption line whose characters are drawn from ``self.alphabet`` and whose
    length is at most ``MAXLEN``; the matching image is ``<name>.jpg`` in
    *imgdir*.
    """

    def __init__(self, txtdir, imgdir, transformFunc=None):
        """
        :param txtdir: folder of .txt files, each one caption line, lowercase,
            all characters in ``self.alphabet``, and len < MAXLEN
        :param imgdir: folder of .jpg images named like the .txt files
        :param transformFunc: torchvision transform applied to each image in
            ``__getitem__``; defaults to ``transform_no_aug`` (ToTensor + Normalize)
        :return: ``self.img_caption_pairs`` is
            [(np.ndarray_uint8_640x480x3, np.ndarray_uint8_(MAXLEN,))]
        """
        self.MAXLEN = 400
        # Late-bind the module-level default so it is resolved at call time
        # rather than at class-definition time.
        self.transformFunc = transform_no_aug if transformFunc is None else transformFunc
        self.alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} "
        # Index 0 is reserved for padding, so characters map to 1..len(alphabet).
        # NOTE(review): '-' appears twice in the alphabet, so the later index wins
        # for that character — confirm whether that is intended.
        self.alphabetDict = {ch: idx + 1 for idx, ch in enumerate(self.alphabet)}
        self.txtnames = [os.path.basename(path) for path in glob.glob(f'{txtdir}/*.txt')]
        self.length = len(self.txtnames)
        self.img_caption_pairs = []
        for txtname in self.txtnames:
            # Swap only the extension; str.replace('txt', 'jpg') would also
            # corrupt filenames that contain 'txt' in their stem.
            imgname = os.path.join(imgdir, os.path.splitext(txtname)[0] + '.jpg')
            with open(os.path.join(txtdir, txtname), 'r') as fin:
                caption_text = fin.readline().strip()
            image = np.asarray(Image.open(imgname), dtype=np.uint8)
            caption_mat = self.txt2mat(caption_text)
            self.img_caption_pairs.append((image, caption_mat))

    def __getitem__(self, i):
        """Return (transformed image tensor, caption index tensor) for item *i*."""
        image, caption_mat = self.img_caption_pairs[i]
        image = self.transformFunc(image)
        caption_mat = torch.from_numpy(caption_mat)
        return image, caption_mat

    def __len__(self):
        # self.length is already an int; the original sum(self.length)
        # raised TypeError (an int is not iterable).
        return self.length

    def txt2mat(self, caption_text):
        """Encode a caption as a fixed-length (MAXLEN,) uint8 index vector,
        zero-padded on the right (0 = padding, 1.. = alphabet indices).

        :raises KeyError: if *caption_text* contains a character outside
            ``self.alphabet``.
        """
        char_mat = np.zeros((self.MAXLEN,), dtype=np.uint8)
        caparr = np.array([self.alphabetDict[char] for char in caption_text], dtype=np.uint8)
        char_mat[:len(caparr)] = caparr
        return char_mat

How can I optimize this part of the code?

Any possible help is appreciated. Thx!