Multiprocessing (num_workers) on Windows

Hello, I recently upgraded my NVIDIA drivers, the CUDA toolkit (to 10.2), and cuDNN so that I can use TensorBoard. However, I now get a new error when setting num_workers > 0:

RuntimeError: cuda runtime error (71) : operation not supported at C:\w\1\s\tmp_conda_3.7_055457\conda\conda-bld\pytorch_1565416617654\work\torch/csrc/generic/StorageSharing.cpp:245

I am running Windows 10. I tried to follow the recommended approach of using the data loaders inside a function:
import torch


def main():
    """Run the data-loading loop from inside a function.

    Illustrative only: ``dataloader`` stands for a DataLoader built elsewhere.
    """
    for i, data in enumerate(dataloader):
        # do something here
        pass


# On Windows, worker processes re-import this module, so the entry point
# must be guarded to avoid re-running it in every worker.
if __name__ == '__main__':
    main()

I know that some multiprocessing functionality is not supported on Windows; however, this used to work before. I also tried downgrading the drivers, but that did not fix the problem :frowning: HELP

@peterjc123 Do you know what could be happening here?

Show me your code, please.

@peterjc123 I still have the problem, and training my model is very slow. I just tried updating CUDA toolkit to 10.1 now.

class PermDataset(Dataset):
    """Perm dataset.

    Each row of the label CSV names one subvolume; ``__getitem__`` cuts that
    subvolume out of the matching in-memory 3D image with ``shift``.
    """

    def __init__(self, csv_file, img_dir, Brea, Benth, GH, Leo, Nav, Bn, CL_20, CL_15, CL_10):
        """
        Args:
            csv_file (string): Path to the csv file with labels
            img_dir (string): Directory with all the images.
            Brea, Benth, GH, Leo, Nav, Bn, CL_20, CL_15, CL_10: the rock
                images, one 3D numpy array each.
        """
        # Pass the path straight to pandas: the former open(csv_file, 'rU')
        # never closed the handle, and the 'U' mode no longer exists in
        # Python 3.11+.
        self.file_name = pd.read_csv(csv_file, encoding='utf-8', engine='c')
        self.img_dir = img_dir
        self.Brea = Brea    # 3D numpy array
        self.Benth = Benth  # 3D numpy array
        self.GH = GH        # 3D numpy array
        self.Leo = Leo      # 3D numpy array
        self.Nav = Nav      # 3D numpy array
        self.Bn = Bn        # 3D numpy array
        self.CL_20 = CL_20  # 3D numpy array
        self.CL_15 = CL_15  # 3D numpy array
        self.CL_10 = CL_10  # 3D numpy array

    def __len__(self):
        """Return the number of rows in the label table."""
        return len(self.file_name)

    def _select_image(self, name_string):
        """Return the image whose tag occurs in ``name_string``.

        Raises:
            ValueError: if no tag matches.
        """
        # Tags are tested in this order and the first hit wins.
        tagged_images = (
            ('Br', self.Brea),
            ('Ben', self.Benth),
            ('BnD', self.Bn),
            ('ND3', self.Nav),
            ('Leo', self.Leo),
            ('GHD', self.GH),
            ('CL20', self.CL_20),
            ('CL15', self.CL_15),
            ('CL10', self.CL_10),
        )
        for tag, image in tagged_images:
            if tag in name_string:
                return image
        # Fail here with a clear message instead of printing and then
        # crashing later on an unbound ``img``.
        raise ValueError('no matching image for {!r}'.format(name_string))

    def __getitem__(self, idx):
        """Return ``(cube, label, name)`` for the sample at row ``idx``.

        1- get the index of training
        2- get the name of rock and id of subvolume
        3- open the image and retrieve the subvolume
        4- find the corresponding permeability value from csv file

        return: subvolume (3d numpy array with a leading axis of length 1
        added), label, and ID (name) of subvolume
        """
        idx_name = self.file_name.iloc[idx, 1]
        idx_label = self.file_name.iloc[idx, 4]

        # The flow direction is the second-to-last character of the name.
        idx_direction = idx_name[-2]
        # NOTE(review): `re.search(` was lost from the posted snippet and is
        # restored here; confirm against the original script.
        name_string = re.search(r'[A-Z]...', idx_name).group(0)
        digits_in_string = re.search(r'[0-9]+', idx_name).group(0)
        s1 = int(digits_in_string)

        # assign image
        img = self._select_image(name_string)

        cube = shift(img, 64, 32, s1, idx_direction)  # function for getting a subvolume from the 3d matrix
        cube = np.expand_dims(cube, 0)

        return cube, idx_label, idx_name

My main code structure:

def trainProcess(load_model=False):
    """Train the regression model and validate it after each epoch.

    Loads the nine rock images, builds a PermDataset, splits it into
    train/validation/test subsets, then runs up to 100 epochs with early
    stopping. The split indices are saved once, after the first epoch.

    NOTE(review): several tokens and short lines were lost from the posted
    snippet (the shuffle body, ``torch.utils.data.Subset(``/``DataLoader(``,
    ``torch.save({``, ``break``). They are restored below and should be
    confirmed against the original script.

    Args:
        load_model: not used in the visible code.
    """
    # writer = SummaryWriter()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # model = CNN3D().float()
    model = resnet50().float()
    criterion = torch.nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    # Lower is better for a loss, so start from +inf and keep the minimum
    # (the former max() starting at 0 tracked the worst epoch instead).
    # NOTE(review): assumes valEpoch returns a loss — confirm.
    best_val = float('inf')
    early_stopping = EarlyStopping(patience=15, verbose=True)
    Brea_image = np.load(os.path.join(img_path, 'BrD2.npy'))
    Benth_image = np.load(os.path.join(img_path, 'BenD2.npy'))
    GH_image = np.load(os.path.join(img_path, 'GHD3.npy'))
    Leo_image = np.load(os.path.join(img_path, 'LeoD3.npy'))
    ND_image = np.load(os.path.join(img_path, 'ND3.npy'))
    Bn_image = np.load(os.path.join(img_path, 'BnD3.npy'))
    CL_20 = np.load(os.path.join(img_path, 'CL_20.npy'))
    CL_15 = np.load(os.path.join(img_path, 'CL_15.npy'))
    CL_10 = np.load(os.path.join(img_path, 'CL_10.npy'))

    # Dataset Split
    jobid = r"C:\Users\drn-4\Desktop\3D\logs_loss\Regresion{}".format(time.strftime("%Y%m%d-%H%M%S"))
    dataset = PermDataset(labels_file, img_path, Brea_image, Benth_image, GH_image, Leo_image, ND_image, Bn_image,
                          CL_20, CL_15, CL_10)
    batch_size = 3
    validation_split = .25
    testing_split = 0.5
    shuffle_dataset = True
    random_seed = 4
    validate_every = 1

    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_split * dataset_size))

    if shuffle_dataset:
        # Seeded so that the split is reproducible between runs.
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_test_indices = indices[split:], indices[:split]

    # The held-out part is split again into validation and test indices.
    split1 = int(np.floor(testing_split * len(val_test_indices)))
    val_indices, test_indices = val_test_indices[split1:], val_test_indices[:split1]

    # Creating PT data samplers and loaders:
    train_dataset1 = torch.utils.data.Subset(dataset, train_indices)
    valid_dataset1 = torch.utils.data.Subset(dataset, val_indices)
    test_dataset1 = torch.utils.data.Subset(dataset, test_indices)

    # Only the training loader uses worker processes.
    train_loader = torch.utils.data.DataLoader(train_dataset1, batch_size=batch_size,
                                               num_workers=4, shuffle=True, pin_memory=False)
    validation_loader = torch.utils.data.DataLoader(valid_dataset1, batch_size=batch_size,
                                                    num_workers=0, shuffle=True, pin_memory=False)
    test_loader = torch.utils.data.DataLoader(test_dataset1, batch_size=batch_size,
                                              num_workers=0, shuffle=True, pin_memory=False)

    # `viz` is the module-level Visdom client created in the __main__ guard.
    # NOTE(review): any further scatter options were lost from the posted snippet.
    win = viz.scatter(
        X=np.array([[1, 2], [3, 4]]),
    )

    # Now, let's start the training process!
    for epoch in range(100):
        # Compute a training epoch
        loss = trainEpoch(train_loader, model, criterion, optimizer, epoch, device)

        # Compute a validation epoch
        if epoch % validate_every == 0:
            val_loss = valEpoch(device, validation_loader, model, criterion, epoch, win)

        # Print best and current validation loss
        is_best = bool(val_loss < best_val)
        best_val = min(val_loss, best_val)
        print('** Validation: %f (best) - %f (current)' % (best_val, val_loss))

        early_stopping(val_loss, model, epoch, optimizer)

        if early_stopping.early_stop:
            print("Early stopping")
            break

        if epoch == 0:
            # Save the split once so it can be reproduced later. The path has
            # no placeholders, so the former .format(...) call was a no-op.
            torch.save({
                'train_indices': train_indices, 'validation_indices': val_indices, 'test_indices': test_indices
            }, r'C:\Users\drn-4\Desktop\3D\checkdir\train_val_test_indices.pth')

# Entry-point guard: on Windows, DataLoader worker processes re-import this
# module, so setup and training must only run in the main process.
if __name__ == "__main__":
    # Plain assignments here already create module-level names; the former
    # `global writer` statement had no effect at module scope.
    writer = SummaryWriter()
    DEFAULT_PORT = 8097
    DEFAULT_HOSTNAME = "http://localhost"
    # NOTE(review): the remaining Visdom arguments were lost from the posted
    # snippet; the call is closed after the last visible one.
    viz = Visdom(port=DEFAULT_PORT, server=DEFAULT_HOSTNAME, base_url='/', username='')
    # NOTE(review): the posted snippet ended before the training call;
    # presumably it was invoked here — confirm.
    trainProcess()


I guess the problem should be in the dataset itself. What is the function shift in PermDataset.__getitem__ doing?

Hi Peter,


The function takes the image (a 3D numpy array) and extracts a subvolume according to the key I pass to it. I also do some preprocessing (removing the unconnected objects from the image using a Python package).

Regardless, it is not clear to me what the best practices are for using multiprocessing when I want to load 3D numpy arrays (64×64×64) for training, together with their labels. I have around 17,000 data points for training. I read the multiprocessing best practices in the PyTorch documentation, but I did not find much that indicates the fastest way to load such data. Any suggestions?

def shift(img, Bs, sh, key, direction):
    """Get a subvolume from the bigger image according to the given key.

    Args:
        img: 3D array holding the whole image.
        Bs: the subvolume size in voxels (PermDataset passes 64).
        sh: the shift in voxels at each step, similar to stride in deep
            learning (PermDataset passes 32).
        key: the subvolume ID, 1-based.
        direction: direction of flow. 'X' and 'Y' transpose the image;
            'Z' (or anything else) leaves it as is.

    Returns:
        float64 array with the shape of the subvolume: the cleaned
        connected-component labels of the inverted subvolume.
    """
    key = key - 1  # IDs are 1-based, unravel_index wants 0-based

    # Transpose 3D matrix according to the direction of flow.
    # 'Z' needs no transpose.
    if direction == 'X':
        img = np.transpose(img, [2, 1, 0])
    elif direction == 'Y':
        img = np.transpose(img, [0, 2, 1])

    # img[img == 0] = 2
    Nx, Ny, Nz = img.shape[:3]

    # Number of whole steps of size `sh` that fit along each axis.
    Nbx = int(np.floor((Nx - Bs) / sh))
    Nby = int(np.floor((Ny - Bs) / sh))
    Nbz = int(np.floor((Nz - Bs) / sh))

    # Turn the flat ID into a grid position, first axis varying fastest.
    i, j, k = np.unravel_index(key, (Nbx + 1, Nby + 1, Nbz + 1), order='F')

    cube = img[sh * i:sh * i + Bs,
               sh * j:sh * j + Bs,
               sh * k:sh * k + Bs]

    # Remove unconnected parts
    labels_inv = util.invert(cube)
    labels_in = labels_inv.astype(np.int32)

    connectivity = 6  # only 26, 18, and 6 are allowed
    labels_out = cc3d.connected_components(labels_in, connectivity=connectivity)  # connected components labels
    N = int(np.max(labels_out))  # number of labels

    # Remove smaller parts: components with fewer voxels than one edge length.
    min_voxels = labels_in.shape[0]  # 64
    for segid in range(1, N + 1):
        segment = labels_out == segid
        if np.count_nonzero(segment) < min_voxels:
            labels_out[segment] = 0

    if N > 1:
        last_z = labels_in.shape[2] - 1
        for segid in range(1, N + 1):
            segment = labels_out == segid
            # z coordinates of this component's voxels.
            z = np.nonzero(segment)[2]
            touches_first = np.any(z == 0)
            touches_last = np.any(z == last_z)
            # Keep (as label 1) only components that reach both the first
            # and the last z-slice; drop everything else.
            # NOTE(review): the `else:` before the final relabel was lost
            # from the posted snippet and is restored here.
            labels_out[segment] = 1 if (touches_first and touches_last) else 0

    return labels_out.astype('float64')

Update !

Just out of curiosity, I ran the exact same code in a Jupyter notebook with num_workers=6 and it works just fine. I was initially running my code in PyCharm with an Anaconda environment as the Python interpreter. What could be the issue?

I think I have the same problem also on pycharm and anaconda

Time to change to Linux

Yes, I don't think it works properly on Windows.