How to track the print statements in data.py - i.e., the dataloader file

Megh_Bhalerao · November 6, 2019, 3:31pm

I have a couple of print statements in data.py - which is essentially the data loader file of my model. The data.py file looks like this:

from augmentations.augs import *
from augmentations.color_aug import *
from augmentations.noise_aug import *
from augmentations.spatial_augs import *
from augmentations.utils import *
import nibabel as nib
import torch
from torch.utils.data.dataset import Dataset
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
import os
import random
from all_augmentations import *
from utils import *
import random
class TumorSegmentationDataset(Dataset):
    """Dataset yielding a cropped, augmented 3-channel volume and its mask.

    Each row of the CSV file holds four file paths, read by column position:
    columns 0-2 are the ``dce_000``/``dce_001``/``dce_002`` volumes and
    column 3 is the ground-truth volume. All four are loaded with nibabel.
    """

    # Probability with which each individual augmentation is applied.
    AUG_PROBABILITY = 0.12

    def __init__(self, csv_file):
        """Read the table of file paths from *csv_file*."""
        self.df = pd.read_csv(csv_file)

    def __len__(self):
        """Return the number of rows (samples) in the CSV table."""
        return len(self.df)

    def transform(self, img, gt):
        """Randomly augment an image/ground-truth pair.

        Both inputs carry a leading axis of length 1 (added in
        ``__getitem__``), which is stripped first. Each augmentation is then
        applied independently with probability ``AUG_PROBABILITY``.

        Returns:
            The ``(img, gt)`` pair without the leading axis.
        """
        img, gt = img[0], gt[0]
        prob = self.AUG_PROBABILITY

        # The ``.copy()`` calls after each helper presumably materialise the
        # result as a fresh array (flips/rotations often return views) --
        # TODO confirm against the augmentation helpers.
        if random.random() < prob:
            img, gt = augment_rot90(img, gt)
            img, gt = img.copy(), gt.copy()
        if random.random() < prob:
            img, gt = augment_mirroring(img, gt)
            img, gt = img.copy(), gt.copy()
        if random.random() < prob:
            img, gt = augment_rotate_angle(img, gt, 45)
            img, gt = img.copy(), gt.copy()
        if random.random() < prob:
            img, gt = augment_rotate_angle(img, gt, 180)
            img, gt = img.copy(), gt.copy()
        if random.random() < prob:
            # np.fliplr reverses axis 1 of both arrays.
            img, gt = np.fliplr(img).copy(), np.fliplr(gt).copy()
        if random.random() < prob:
            # Noise is applied to the image channels only, never to the mask.
            for channel in range(3):
                img[channel] = gaussian(img[channel], True, 0, 0.1)
        return img, gt

    def rcrop(self, imshape, psize):
        """Pick a random offset for a patch along the first two axes.

        Only the first two entries of *imshape* and *psize* are used; no
        offset is drawn for the third axis.

        Returns:
            ``(xshift, yshift)`` with ``0 <= xshift <= imshape[0] - psize[0]``
            and ``0 <= yshift <= imshape[1] - psize[1]``.

        Raises:
            ValueError: if the patch is larger than the image along either of
                the first two axes.
        """
        max_xshift = imshape[0] - psize[0]
        max_yshift = imshape[1] - psize[1]
        if max_xshift < 0 or max_yshift < 0:
            raise ValueError(
                f"patch size {tuple(psize[:2])} does not fit into image of "
                f"shape {tuple(imshape[:2])}"
            )
        xshift = random.randint(0, max_xshift)
        yshift = random.randint(0, max_yshift)
        return xshift, yshift

    def __getitem__(self, index):
        """Load, crop and augment the sample stored in row *index*.

        Returns:
            A dict with keys ``'image'`` (the three volumes stacked along a
            new first axis) and ``'gt'`` (the ground truth with a leading
            axis of length 1), both after ``transform``.
        """
        dce_000_path = self.df.iloc[index, 0]
        dce_001_path = self.df.iloc[index, 1]
        dce_002_path = self.df.iloc[index, 2]
        gt_path = self.df.iloc[index, 3]

        dce_000 = nib.load(dce_000_path).get_fdata()
        dce_001 = nib.load(dce_001_path).get_fdata()
        dce_002 = nib.load(dce_002_path).get_fdata()
        gt = nib.load(gt_path).get_fdata()

        # Only the first two entries are used for cropping; the third axis is
        # kept at full length by the slices below.
        psize = (176, 176, 64)

        # One offset is drawn from dce_000's shape and reused for every
        # volume so that all channels and the mask stay aligned.
        xshift, yshift = self.rcrop(dce_000.shape, psize)
        # Debug output: shape of the full volume before cropping.
        print(dce_000.shape)
        dce_000 = dce_000[xshift:xshift + psize[0], yshift:yshift + psize[1], :]
        dce_001 = dce_001[xshift:xshift + psize[0], yshift:yshift + psize[1], :]
        dce_002 = dce_002[xshift:xshift + psize[0], yshift:yshift + psize[1], :]
        gt = gt[xshift:xshift + psize[0], yshift:yshift + psize[1], :]

        # Give each volume a channel axis, then stack the three channels.
        dce_000 = np.expand_dims(dce_000, axis=0)
        dce_001 = np.expand_dims(dce_001, axis=0)
        dce_002 = np.expand_dims(dce_002, axis=0)

        image = np.concatenate((dce_000, dce_001, dce_002), axis=0)
        # Extra leading axis of length 1; transform() strips it again.
        image = np.expand_dims(image, axis=0)

        gt = np.expand_dims(np.expand_dims(gt, axis=0), axis=0)

        image, gt = self.transform(image, gt)

        # Debug output: crop offsets, then the shape of the cropped dce_000
        # (which by now includes its leading channel axis of length 1).
        print(xshift, yshift)
        print(dce_000.shape)
        sample = {'image': image, 'gt': gt}
        return sample

My batch size is 1. Given this, the three print statements should appear sequentially for each sample, but they come out in a seemingly random order, like this:

(512, 512, 48)
(256, 256, 64)
(256, 256, 64)
29 59
(1, 160, 160, 64)
40 68
(1, 160, 160, 64)
(256, 256, 64)
339 199
(1, 160, 160, 48)
(256, 256, 64)
(256, 256, 64)
(256, 256, 240)
82 38
(1, 160, 160, 64)
43 82
(1, 160, 160, 240)
(512, 512, 48)

This is not the right order of the print statements.
Is this something related to some cache or how the data is being accessed or something? I am very confused.

ptrblck · November 6, 2019, 3:37pm

Are you calling the Dataset directly or are you using a DataLoader?

Note that multiple workers in a DataLoader might cause the print statements to overlap.

Megh_Bhalerao · November 6, 2019, 3:43pm

I am using a DataLoader like this :

# Build the training dataset from the CSV file of paths.
dataset_train = TumorSegmentationDataset(train_csv)
# num_workers=4: samples are loaded by four DataLoader worker processes.
train_loader = DataLoader(dataset_train,batch_size= batch,shuffle=True, num_workers=4)

And I access my data like this:

# Iterate over the batches produced by train_loader; each `subject` is the
# batched form of the dict returned by the dataset's __getitem__.
for batch_idx, (subject) in enumerate(train_loader):
        # Load the subject and its ground truth

        image = subject['image']
        mask = subject['gt']

Oh, so you mean num_workers=4 might be causing this issue of print statements coming in the order that I do not expect?

ptrblck · November 6, 2019, 5:06pm

Yes, exactly. Since each worker is its own process, the print statements from different workers might overlap.
For debugging purposes you could just set num_workers=0.

Megh_Bhalerao · November 6, 2019, 5:57pm

Thanks, I am doing that right now. What is num_workers exactly? Is it like the number of threads or something?

ptrblck · November 6, 2019, 9:29pm

It defines the number of worker processes used to load and prefetch batches.