Thank you very much for the quick reply, here is my code.
import os
import nltk
import numpy as np
import torch
import torch.utils.data as data


class PrecompDataset(data.Dataset):
    """Load precomputed captions and image features."""

    def __init__(self, data_path, data_split, vocab, vocab_tag):
        self.vocab = vocab
        self.vocab_tag = vocab_tag
        loc = data_path + '/'

        # Captions
        self.captions = []
        with open(loc + '%s_caps.txt' % data_split, 'rb') as f:
            for line in f:
                self.captions.append(line.strip())

        # Image features
        self.images = np.load(loc + '%s_ims.npy' % data_split)
        self.length = len(self.captions)
        # If there are more captions than image rows, every image has 5
        # captions, and im_div maps a caption index to its image row.
        if self.images.shape[0] != self.length:
            self.im_div = 5
        else:
            self.im_div = 1
        # the development set for coco is large and so validation would be slow
        if data_split == 'dev':
            self.length = 5000
    def __getitem__(self, index):
        # handle the image redundancy: several caption indices share one image row
        img_id = index // self.im_div
        image = torch.Tensor(self.images[img_id])
        caption = self.captions[index]
        vocab = self.vocab
        vocab_tag = self.vocab_tag

        # Convert caption (string) to word ids.
        tokens = nltk.tokenize.word_tokenize(
            str(caption).lower().decode('utf-8'))
        caption = []
        caption.append(vocab('<start>'))
        caption.extend([vocab(token) for token in tokens])
        caption.append(vocab('<end>'))
        target = torch.Tensor(caption)
        # NOTE: collate_fn below also expects a caption_tag per item;
        # how it is built with vocab_tag is not shown in this snippet.
        return image, target, index, img_id

    def __len__(self):
        return self.length
def collate_fn(data):
    '''Build mini-batch tensors from a list of
    (image, caption, caption_tag, index, img_id) tuples.
    '''
    # Sort the data list by caption length (longest first)
    data.sort(key=lambda x: len(x[1]), reverse=True)
    images, captions, captions_tag, ids, img_ids = zip(*data)

    # Merge images (convert tuple of per-item tensors to one batch tensor)
    images = torch.stack(images, 0)

    # Merge captions (convert tuple of 1D tensors to a padded 2D tensor)
    lengths = [len(cap) for cap in captions]
    targets = torch.zeros(len(captions), max(lengths)).long()
    for i, cap in enumerate(captions):
        end = lengths[i]
        targets[i, :end] = cap[:end]

    return images, targets, captions_tag, lengths, ids
def get_precomp_loader(data_path, data_split, vocab, vocab_tag, opt,
                       batch_size=100, shuffle=True, num_workers=2):
    """Returns torch.utils.data.DataLoader for custom coco dataset."""
    dset = PrecompDataset(data_path, data_split, vocab, vocab_tag)
    data_loader = torch.utils.data.DataLoader(dataset=dset,
                                              batch_size=batch_size,
                                              shuffle=shuffle,
                                              num_workers=num_workers,
                                              pin_memory=True,
                                              collate_fn=collate_fn)
    return data_loader
def get_loaders(data_name, vocab, vocab_tag, batch_size, workers, opt):
    dpath = os.path.join(opt.data_path, data_name)
    train_loader = get_precomp_loader(dpath, 'train', vocab, vocab_tag, opt,
                                      batch_size, True, workers)
    val_loader = get_precomp_loader(dpath, 'dev', vocab, vocab_tag, opt,
                                    batch_size, False, workers)
    return train_loader, val_loader
This is the dataloader; it may be a little complex.
Put simply, I want to get a batch in which one image corresponds to 5 captions and one caption_tag.
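For what it's worth, a quick way to see how the index-to-image mapping behaves is to index the dataset directly. This is only a rough sketch: the data path, vocab and vocab_tag below are placeholders for your real objects, and it assumes the (fixed) __getitem__ above that returns the target as well.

# Rough sketch: check how caption indices map to image rows.
# '/path/to/precomp', vocab and vocab_tag are placeholders for your real data.
dset = PrecompDataset('/path/to/precomp', 'dev', vocab, vocab_tag)
for index in range(6):
    image, target, idx, img_id = dset[index]
    print(idx, img_id, tuple(image.shape), len(target))
# If im_div is 5 (the caption file has 5x more lines than image rows),
# indices 0-4 print img_id 0 and index 5 prints img_id 1; each item is
# still one caption paired with one (repeated) image feature.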
And I want to print the dataloader's output because:
someone told me that the dataloader yields one image per caption, but someone else told me it yields one image per 5 captions. I want to find out which answer is correct.
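One way to settle it is simply to pull a single batch from the loader and print its shapes. A minimal sketch, assuming the code above, that __getitem__ also returns the caption_tag that collate_fn expects (that part is not in the snippet), and that vocab, vocab_tag and opt already exist; 'coco_precomp' is a placeholder dataset name.

# Rough sketch: inspect one batch from the training loader.
train_loader, val_loader = get_loaders('coco_precomp', vocab, vocab_tag,
                                       batch_size=100, workers=2, opt=opt)
images, targets, captions_tag, lengths, ids = next(iter(train_loader))
print(images.shape)    # one image feature row per batch element
print(targets.shape)   # (batch_size, longest caption in the batch)
print(lengths[:5])     # caption lengths, sorted longest first by collate_fn
print(ids[:5])         # original caption indices; when im_div == 5, two ids
                       # with the same value of id // 5 share one image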