[DataLoader] num_workers>0 error

I use Windows 10 Pro, a GeForce GTX 1050 Ti graphics card, and Anaconda Spyder with Python 3.7.

I am learning about U-Net and practicing with example code.

But I have a problem: it seems that enumerate(dataLoader[phase]) raises the error.

==============
I use this code for the dataset and dataloader:

# imports used by this snippet
import sys
import random

import numpy as np
import scipy.ndimage
import tables


class Dataset(object):
    def __init__(self, fname, img_transform=None, mask_transform=None, edge_weight=False):
        # nothing special here, just internalizing the constructor parameters
        self.fname = fname
        self.edge_weight = edge_weight

        self.img_transform = img_transform
        self.mask_transform = mask_transform

        self.tables = tables.open_file(self.fname)
        self.numpixels = self.tables.root.numpixels[:]
        self.nitems = self.tables.root.img.shape[0]
        self.tables.close()

        self.img = None
        self.mask = None

    def __getitem__(self, index):
        # opening should be done in __init__, but there seems to be
        # an issue with multithreading, so it is done here
        with tables.open_file(self.fname, 'r') as db:
            self.img = db.root.img
            self.mask = db.root.mask

            # get the requested image and mask from the pytable
            img = self.img[index, :, :, :]
            mask = self.mask[index, :, :]

        # the original U-Net paper assigns increased weights to the edges of the annotated objects
        # their method is more sophisticated, but this one is faster: we simply dilate the mask and
        # highlight all the pixels which were "added"
        if self.edge_weight:
            weight = scipy.ndimage.morphology.binary_dilation(mask == 1, iterations=2) & ~mask
        else:  # otherwise the edge weight is all ones and thus has no effect
            weight = np.ones(mask.shape, dtype=mask.dtype)

        mask = mask[:, :, None].repeat(3, axis=2)      # in order to use the transformations given by torchvision,
        weight = weight[:, :, None].repeat(3, axis=2)  # inputs need to be 3D, so here we convert from 2D to 3D by repetition

        img_new = img
        mask_new = mask
        weight_new = weight

        seed = random.randrange(sys.maxsize)  # get a random seed so that we can reproducibly apply the transformations
        if self.img_transform is not None:
            random.seed(seed)  # apply this seed to the image transforms
            img_new = self.img_transform(img)

        if self.mask_transform is not None:
            random.seed(seed)
            mask_new = self.mask_transform(mask)
            mask_new = np.asarray(mask_new)[:, :, 0].squeeze()

            random.seed(seed)
            weight_new = self.mask_transform(weight)
            weight_new = np.asarray(weight_new)[:, :, 0].squeeze()

        return img_new, mask_new, weight_new

    def __len__(self):
        return self.nitems


#note that since we need the transformations to be reproducible for both masks and images,
#we do the spatial transformations first, and afterwards do any color augmentations
img_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomVerticalFlip(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(size=(patch_size, patch_size), pad_if_needed=True),  # these need to be in a reproducible order: first affine transforms, then color
    transforms.RandomResizedCrop(size=patch_size),
    transforms.RandomRotation(180),
    transforms.ColorJitter(brightness=0, contrast=0, saturation=0, hue=.5),
    transforms.RandomGrayscale(),
    transforms.ToTensor()
])

mask_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomVerticalFlip(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(size=(patch_size, patch_size), pad_if_needed=True),  # these need to be in a reproducible order: first affine transforms, then color
    transforms.RandomResizedCrop(size=patch_size, interpolation=PIL.Image.NEAREST),
    transforms.RandomRotation(180),
])
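
Just to illustrate the reproducibility idea for myself, a quick check like the following seems to give aligned spatial transforms. This is only a sketch: it assumes patch_size and the two transform pipelines above are already defined, that the installed (older) torchvision draws its randomness from Python's random module, and dummy_img / dummy_mask are made-up arrays:

# made-up image/mask pair just to check that seeding aligns the spatial transforms
import random
import numpy as np

dummy_img = np.random.randint(0, 255, (300, 300, 3), dtype=np.uint8)
dummy_mask = np.random.randint(0, 2, (300, 300, 3), dtype=np.uint8)

seed = 12345
random.seed(seed)
out_img = img_transform(dummy_img)     # spatial transforms consume the shared random stream first
random.seed(seed)
out_mask = mask_transform(dummy_mask)  # so crops/flips/rotations should match the image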

dataset = {}
dataLoader = {}
for phase in phases:  # now for each of the phases, we're creating the dataloader
    # interestingly, given the batch size, I've not seen any improvements from using num_workers>0

    dataset[phase] = Dataset(f"./{dataname}_{phase}.pytable", img_transform=img_transform, mask_transform=mask_transform, edge_weight=edge_weight)
    dataLoader[phase] = DataLoader(dataset[phase], batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
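
(As far as I understand, on Windows the DataLoader workers are started with spawn — the traceback below goes through multiprocessing\popen_spawn_win32.py — so the PyTorch Windows notes say the entry point should be guarded by if __name__ == '__main__': whenever num_workers > 0. A placeholder sketch of what I mean; main() is hypothetical and not my actual script layout:)

# placeholder sketch: guard the entry point so spawned worker processes can
# re-import this module without re-executing the training code at import time
def main():
    # build dataset/dataLoader and run the training loop here
    ...

if __name__ == '__main__':
    main()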

=======================================

Then, when I run this code, an error occurs:

writer = SummaryWriter()  # open the tensorboard visualiser
best_loss_on_test = np.Infinity
edge_weight = torch.tensor(edge_weight).to(device)
start_time = time.time()
for epoch in range(num_epochs):
    # zero out epoch-based performance variables
    all_acc = {key: 0 for key in phases}
    all_loss = {key: torch.zeros(0).to(device) for key in phases}
    cmatrix = {key: np.zeros((2, 2)) for key in phases}

    for phase in phases:  # iterate through both training and validation states

        if phase == 'train':
            model.train()  # set model to training mode
        else:  # when in eval mode, we don't want parameters to be updated
            model.eval()   # set model to evaluation mode

        for ii, [X, y, y_weight] in enumerate(dataLoader[phase]):  # for each of the batches
            X = X.to(device)  # [Nbatch, 3, H, W]
            y_weight = y_weight.type('torch.FloatTensor').to(device)
            y = y.type('torch.LongTensor').to(device)  # [Nbatch, H, W] with class indices (0, 1)

            with torch.set_grad_enabled(phase == 'train'):  # dynamically set gradient computation; in the validation case it isn't needed
                                                            # disabling it is good practice and improves inference time

                prediction = model(X)  # [N, Nclass, H, W]
                loss_matrix = criterion(prediction, y)
                loss = (loss_matrix * (edge_weight ** y_weight)).mean()  # can skip if edge_weight == 1

                if phase == "train":  # in case we're in train mode, we need to do back propagation
                    optim.zero_grad()
                    loss.backward()
                    optim.step()
                    train_loss = loss

                all_loss[phase] = torch.cat((all_loss[phase], loss.detach().view(1, -1)))

                if phase in validation_phases:  # if this phase is part of validation, compute the confusion matrix
                    p = prediction[:, :, :, :].detach().cpu().numpy()
                    cpredflat = np.argmax(p, axis=1).flatten()
                    yflat = y.cpu().numpy().flatten()

                    cmatrix[phase] = cmatrix[phase] + confusion_matrix(yflat, cpredflat, labels=range(n_classes))

        all_acc[phase] = (cmatrix[phase] / cmatrix[phase].sum()).trace()
        all_loss[phase] = all_loss[phase].cpu().numpy().mean()

        # save metrics to tensorboard
        writer.add_scalar(f'{phase}/loss', all_loss[phase], epoch)
        if phase in validation_phases:
            writer.add_scalar(f'{phase}/acc', all_acc[phase], epoch)
            writer.add_scalar(f'{phase}/TN', cmatrix[phase][0, 0], epoch)
            writer.add_scalar(f'{phase}/TP', cmatrix[phase][1, 1], epoch)
            writer.add_scalar(f'{phase}/FP', cmatrix[phase][0, 1], epoch)
            writer.add_scalar(f'{phase}/FN', cmatrix[phase][1, 0], epoch)
            writer.add_scalar(f'{phase}/TNR', cmatrix[phase][0, 0] / (cmatrix[phase][0, 0] + cmatrix[phase][0, 1]), epoch)
            writer.add_scalar(f'{phase}/TPR', cmatrix[phase][1, 1] / (cmatrix[phase][1, 1] + cmatrix[phase][1, 0]), epoch)

    print('%s ([%d/%d] %d%%), train loss: %.4f test loss: %.4f' %
          (timeSince(start_time, (epoch + 1) / num_epochs), epoch + 1, num_epochs,
           (epoch + 1) / num_epochs * 100, all_loss["train"], all_loss["val"]), end="")

    # if the current loss is the best we've seen, save the model state with all variables
    # necessary for recreation
    if all_loss["val"] < best_loss_on_test:
        best_loss_on_test = all_loss["val"]
        print("  **")
        state = {'epoch': epoch + 1,
                 'model_dict': model.state_dict(),
                 'optim_dict': optim.state_dict(),
                 'best_loss_on_test': all_loss,
                 'n_classes': n_classes,
                 'in_channels': in_channels,
                 'padding': padding,
                 'depth': depth,
                 'wf': wf,
                 'up_mode': up_mode,
                 'batch_norm': batch_norm}

        torch.save(state, f"{dataname}_unet_best_model.pth")
    else:
        print("")

The error is:

ipdb> Traceback (most recent call last):

  File "", line 1, in
    debugfile('C:/Users/mbmhm/Desktop/unet/train_unet.py', wdir='C:/Users/mbmhm/Desktop/unet')

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 856, in debugfile
    debugger.run("runfile(%r, args=%r, wdir=%r)" % (filename, args, wdir))

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\bdb.py", line 585, in run
    exec(cmd, globals, locals)

  File "", line 1, in

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 827, in runfile
    execfile(filename, namespace)

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)

  File "c:/users/mbmhm/desktop/unet/train_unet.py", line 265, in
    for ii , [X, y, y_weight] in enumerate(dataLoader[phase]): #for each of the batches

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\site-packages\torch\utils\data\dataloader.py", line 193, in __iter__
    return _DataLoaderIter(self)

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\site-packages\torch\utils\data\dataloader.py", line 469, in __init__
    w.start()

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\multiprocessing\process.py", line 112, in start
    self._popen = self._Popen(self)

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\multiprocessing\context.py", line 223, in _Popen
    return _default_context.get_context().Process._Popen(process_obj)

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\multiprocessing\context.py", line 322, in _Popen
    return Popen(process_obj)

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\multiprocessing\popen_spawn_win32.py", line 89, in __init__
    reduction.dump(process_obj, to_child)

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\multiprocessing\reduction.py", line 60, in dump
    ForkingPickler(file, protocol).dump(obj)

  File "stringsource", line 2, in tables.hdf5extension.Array.__reduce_cython__

TypeError: self.dims,self.dims_chunk,self.maxdims cannot be converted to a Python object for pickling
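
From the last line it looks like an open PyTables array (tables.hdf5extension.Array) ends up being pickled when the worker processes are spawned. I am guessing that keeping the PyTables arrays as local variables in __getitem__, instead of assigning them to self.img / self.mask, would avoid storing an unpicklable handle on the Dataset object. A sketch of my guess only, not a verified fix:

def __getitem__(self, index):
    # guess: read into locals so no PyTables handle is kept on self,
    # which would otherwise have to be pickled when workers are spawned
    with tables.open_file(self.fname, 'r') as db:
        img = db.root.img[index, :, :, :]
        mask = db.root.mask[index, :, :]
    # ... rest of the method (edge weight, transforms, return) unchanged ...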

===================================
I also tried different code, but an error still occurs.

for x, y, w in dataLoader['train']:
    print(x.shape, y.shape, w.shape)

ipdb> _CudaDeviceProperties(name='GeForce GTX 1050 Ti', major=6, minor=1, total_memory=4096MB, multi_processor_count=6)
total params: 122466

ipdb> Traceback (most recent call last):

  File "", line 1, in
    debugfile('C:/Users/mbmhm/Desktop/unet/train_unet.py', wdir='C:/Users/mbmhm/Desktop/unet')

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 856, in debugfile
    debugger.run("runfile(%r, args=%r, wdir=%r)" % (filename, args, wdir))

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\bdb.py", line 585, in run
    exec(cmd, globals, locals)

  File "", line 1, in

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 827, in runfile
    execfile(filename, namespace)

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)

  File "c:/users/mbmhm/desktop/unet/train_unet.py", line 200, in
    for x,y,w in dataLoader['train']:

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\site-packages\torch\utils\data\dataloader.py", line 576, in __next__
    idx, batch = self._get_batch()

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\site-packages\torch\utils\data\dataloader.py", line 543, in _get_batch
    success, data = self._try_get_batch()

  File "C:\Users\mbmhm\ansel\Anaconda3\envs\moongpu\lib\site-packages\torch\utils\data\dataloader.py", line 519, in _try_get_batch
    raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str))

RuntimeError: DataLoader worker (pid(s) 500, 6872) exited unexpectedly

What can I do to solve this problem?

Could you run your code with num_workers=0 and check if you get a proper error message?
Maybe something raises an error in your __getitem__, which might be lost due to multiprocessing.
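
Something like this, just as a debugging sketch (it assumes the phases, dataset, and dataLoader dicts from your code above, and it only surfaces the underlying error rather than fixing it):

# rebuild the loaders with num_workers=0 so any exception from __getitem__
# is raised directly in the main process instead of dying in a worker
for phase in phases:
    dataLoader[phase] = DataLoader(dataset[phase], batch_size=batch_size,
                                   shuffle=True, num_workers=0, pin_memory=True)

# indexing the dataset directly bypasses the DataLoader entirely
img, mask, weight = dataset['train'][0]
print(img.shape, mask.shape, weight.shape)

# and pull one batch through the single-process loader
x, y, w = next(iter(dataLoader['train']))
print(x.shape, y.shape, w.shape)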