I have written this dataset:
class Classification(Dataset):
    """Dataset that pairs each image with the text of its target file.

    Item ``i`` is built from ``image_paths[i]`` (an image file) and
    ``targets[i]`` (the path of a text file holding that image's label text).

    Args:
        image_paths: Sequence of image file paths.
        targets: Sequence of text-file paths, parallel to ``image_paths``.
        resize: Optional pair indexed as ``(height, width)``. When given,
            every image is resized to that size before normalisation.
    """

    def __init__(self, image_paths, targets, resize=None):
        self.image_paths = image_paths
        self.targets = targets
        self.resize = resize
        # Normalisation uses the library's default parameters.
        self.aug = al.Compose([al.Normalize(always_apply=True)])

    def __len__(self):
        """Return the number of samples (one per image path)."""
        return len(self.image_paths)

    @staticmethod
    def _read_target(path):
        """Return the complete, unmodified contents of the text file at *path*."""
        with open(path, "r") as handle:
            return handle.read()

    def __getitem__(self, position):
        """Load, preprocess and return the sample at index *position*.

        Returns:
            A dict with two entries:

            * ``"images"``: float tensor in channel-first layout, produced by
              transposing the normalised image array with ``(2, 0, 1)``.
            * ``"targets"``: a one-element list holding the whole text of the
              target file, newlines included.

        NOTE(review): the target is returned as raw text, not as a tensor.
        Presumably it must be encoded to integer indices before it can be fed
        to a loss function; confirm against the training code.
        """
        image = Image.open(self.image_paths[position]).convert("RGB")

        # The whole file is one label string; it is wrapped in a list so the
        # returned structure matches what callers already receive.
        words = [self._read_target(self.targets[position])]

        if self.resize is not None:
            # PIL expects (width, height) while ``resize`` is indexed as
            # (height, width), hence the swapped order.
            image = image.resize(
                (self.resize[1], self.resize[0]),
                resample=Image.BILINEAR,
            )

        image = np.array(image)
        image = self.aug(image=image)["image"]
        # Reorder axes from (H, W, C) to (C, H, W).
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        return {
            "images": torch.tensor(image, dtype=torch.float),
            "targets": words,
        }
This returns two things:
-
a tensor of size (3, 120, 480)
(I have kept the image size as w x h = 480 x 120)
-
the text from the corresponding .txt file
{'images': tensor([[[1.3927, 0.6392, 0.1768, ..., 2.2489, 2.2489, 2.2489],
[1.4098, 0.6906, 0.1597, ..., 2.2489, 2.2489, 2.2489],
[1.2899, 0.6392, 0.0741, ..., 2.2489, 2.2489, 2.2489],
...,
[1.2557, 0.5707, 1.9407, ..., 2.2147, 2.2147, 2.2147],
[1.1700, 0.6392, 1.9578, ..., 2.2147, 2.2147, 2.2147],
[0.7248, 0.1426, 1.2557, ..., 2.2147, 2.2147, 2.2147]],
[[1.5532, 0.7829, 0.3102, ..., 2.4286, 2.4286, 2.4286],
[1.5707, 0.8354, 0.2927, ..., 2.4286, 2.4286, 2.4286],
[1.4482, 0.7829, 0.2052, ..., 2.4286, 2.4286, 2.4286],
...,
[1.4132, 0.7129, 2.1134, ..., 2.3936, 2.3936, 2.3936],
[1.3256, 0.7829, 2.1310, ..., 2.3936, 2.3936, 2.3936],
[0.8704, 0.2752, 1.4132, ..., 2.3936, 2.3936, 2.3936]],
[[1.7685, 1.0017, 0.5311, ..., 2.6400, 2.6400, 2.6400],
[1.7860, 1.0539, 0.5136, ..., 2.6400, 2.6400, 2.6400],
[1.6640, 1.0017, 0.4265, ..., 2.6400, 2.6400, 2.6400],
...,
[1.6291, 0.9319, 2.3263, ..., 2.6051, 2.6051, 2.6051],
[1.5420, 1.0017, 2.3437, ..., 2.6051, 2.6051, 2.6051],
[1.0888, 0.4962, 1.6291, ..., 2.6051, 2.6051, 2.6051]]]),
'targets': ['I want to use images that have\ntext in it that is written in\nmultiple lines like this.\n\nMy question is how do I prepare a data\nset for the network and load it.\n']}
Is this OK,
or should I try something else?
Can I go ahead with this and write the other parts,
or
do I need to convert the targets to a tensor as well?