I’ve noticed a very strange bug with the Pytorch dataloader. Essentially, there seems to be an issue with the dataloader not randomly seeding each worker properly. This problem only seems to exist when multiple workers are used. Here is what I am running
import numpy as np
import torch
import torch.utils.data as data
from torch.utils.data import DataLoader
class BaseDataset(data.Dataset):
def __init__(self, num_samples=5, batch_size=2, num_workers=1):
self.num_samples = num_samples
self.batch_size = batch_size
self.num_workers = num_workers
def __len__(self):
return self.num_samples
def __getitem__(self, sample_idx):
if self.num_workers > 0:
worker_info = torch.utils.data.get_worker_info()
worker_seed = worker_info.seed
print(worker_seed, worker_info.id, np.random.randint(0, 100))
else:
print(np.random.randint(0, 100))
return 0
if __name__ == '__main__':
np.random.seed(1)
torch.random.manual_seed(1)
torch.manual_seed(1)
num_workers = 0
dummy_dataset = BaseDataset(num_samples=5, num_workers=num_workers)
dataloader = DataLoader(dummy_dataset,
batch_size=dummy_dataset.batch_size,
shuffle=False,
num_workers=num_workers,
pin_memory=True,
drop_last=True)
epochs = 5
for e in range(epochs):
print('epoch: ', int(e))
for idx, _ in enumerate(dataloader):
a = 1
print()
When I run the code with num_wokers=0
to disable multi-threading, the outputs of the random numbers are random. But with `num_workers > 0’, the outputs are no longer random.
Further more, I tested this by creating a brand new conda environment with these packages. Pip was not used to ensure consistency. I am using our work server for this work which is running Centos 7.
# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: linux-64
_libgcc_mutex=0.1=main
blas=1.0=mkl
ca-certificates=2020.1.1=0
certifi=2020.4.5.1=py37_0
cpuonly=1.0=0
freetype=2.9.1=h8a8886c_1
intel-openmp=2020.1=217
jpeg=9b=h024ee3a_2
ld_impl_linux-64=2.33.1=h53a641e_7
libedit=3.1.20181209=hc058e9b_0
libffi=3.3=he6710b0_1
libgcc-ng=9.1.0=hdf63c60_0
libgfortran-ng=7.3.0=hdf63c60_0
libpng=1.6.37=hbc83047_0
libstdcxx-ng=9.1.0=hdf63c60_0
libtiff=4.1.0=h2733197_0
mkl=2020.1=217
mkl-service=2.3.0=py37he904b0f_0
mkl_fft=1.0.15=py37ha843d7b_0
mkl_random=1.1.0=py37hd6b4f25_0
ncurses=6.2=he6710b0_1
ninja=1.9.0=py37hfd86e86_0
numpy=1.18.1=py37h4f9e942_0
numpy-base=1.18.1=py37hde5b4d6_1
olefile=0.46=py37_0
openssl=1.1.1g=h7b6447c_0
pillow=7.1.2=py37hb39fc2d_0
pip=20.0.2=py37_3
python=3.7.7=hcff3b4d_5
pytorch=1.5.0=py3.7_cpu_0
readline=8.0=h7b6447c_0
setuptools=46.2.0=py37_0
six=1.14.0=py37_0
sqlite=3.31.1=h62c20be_1
tk=8.6.8=hbc83047_0
torchvision=0.6.0=py37_cpu
wheel=0.34.2=py37_0
xz=5.2.5=h7b6447c_0
zlib=1.2.11=h7b6447c_3
zstd=1.3.7=h0b5b093_0