Hello all,
I am writing a multi-class image classification and distance estimation network. The dataset I am using consists of images and their corresponding annotation file (containing class id, bbox, distance) as shown below
1,74,127,92,139,47
1,0,132,34,168,17
1,0,137,14,179,14
7,94,125,98,139,46
1,156,127,201,162,17
1,163,129,219,170,15
1,183,131,224,181,13
6,196,138,224,205,10
1,192,148,224,224,7
I have written a custom dataset class for this task, as presented below.
class CreateDataLoader(Dataset):
    """Dataset for joint object classification and distance estimation.

    Expects a directory layout of ``root_dir/images`` (files named like
    ``camera_*``) and ``root_dir/csv`` (matching files named ``CSV_*.csv``).
    Each CSV row is: ``class_id, xmin, ymin, xmax, ymax, distance``.

    __getitem__ returns:
        image        -- PIL image (or transformed tensor if `transforms` given)
        objectLabels -- int64 tensor of shape (N,)   -- class ids
        bboxes       -- float32 tensor of shape (N, 4) -- [xmin, ymin, xmax, ymax]
        distances    -- float32 tensor of shape (N,)
    where N is the number of annotated objects in the image (may be 0).
    """

    def __init__(self, root_dir, transforms=None):
        self.root = root_dir
        # Sorted listings so the dataset order is deterministic across runs.
        self.csvs = list(sorted(os.listdir(os.path.join(root_dir, "csv"))))
        self.images = list(sorted(os.listdir(os.path.join(root_dir, "images"))))
        self.transforms = transforms

    def __getitem__(self, index):
        imagePath = os.path.join(self.root, "images", self.images[index])
        filename, ext = os.path.splitext(os.path.basename(imagePath))
        # The annotation file shares the image's name, with 'camera' -> 'CSV'.
        csvFilename = filename.replace('camera', 'CSV')
        csvFile = os.path.join(self.root, "csv", (csvFilename + ".csv"))

        image = Image.open(imagePath).convert("RGB")

        bboxes = []
        objectLabels = []
        distances = []
        with open(csvFile, 'r') as read_obj:
            for row in csv.reader(read_obj):
                if not row:
                    # Skip blank lines instead of raising IndexError.
                    continue
                # row: class_id, xmin, ymin, xmax, ymax, distance.
                # Parse the string fields directly rather than round-tripping
                # each one through a 0-d numpy string array.
                objectLabels.append(int(row[0]))
                bboxes.append([int(v) for v in row[1:5]])
                distances.append(float(row[5]))

        # reshape(-1, 4) keeps the empty case well-formed as (0, 4) instead
        # of the (0,) shape torch.as_tensor([]) would otherwise produce.
        bboxes = torch.as_tensor(bboxes, dtype=torch.float32).reshape(-1, 4)
        # Class ids must be integer (int64): CrossEntropyLoss and friends
        # reject float targets for multi-class classification.
        objectLabels = torch.as_tensor(objectLabels, dtype=torch.int64)
        distances = torch.as_tensor(distances, dtype=torch.float32)

        if self.transforms is not None:
            image = self.transforms(image)
        return image, objectLabels, bboxes, distances

    def __len__(self):
        # One sample per image file.
        return len(self.images)
Since the lengths of objectLabels, bboxes and distances vary from image to image (because the number of objects in each image differs), I had to write a custom collate function, shown below.
def collate_fn_Custom(batch):
    """Collate samples whose per-image annotation counts differ.

    Images all share one shape, so they are stacked into a single
    (B, C, H, W) tensor; labels, boxes and distances are left as
    per-image tensors in plain lists (one entry per image, index i of
    each list belongs to image i of the stacked batch).
    """
    # Transpose the batch: [(img, lbl, box, dst), ...] -> 4 tuples.
    imgs, lbls, bxs, dsts = zip(*batch)
    return torch.stack(imgs, dim=0), list(lbls), list(bxs), list(dsts)
It works fine for a batch size of 1, but I am failing to understand how to extend it to larger batch sizes. Of course, I can pass any batch size as an argument — but how can the model associate each image with its corresponding annotations when the batch contains more than one image?
The source for collate function is here.