Bug
I implement a iterable dataset and use dataloader to generate batch data,
when num worker set above 0, I find after epoch 0 (epoch 1,2,3…) will miss data.
like in epoch 0 has 40 batches but when in epoch 1,2,3 there are only 20 batches (equal to num_worker)
it seems in epoch 1,2,3, each workers onle read the first chunk of a textfilereader provided by pandas.
class pandasDataset(IterableDataset):
def __init__(self, filelist, D, d):
self.filelist = filelist
self.iterlist = []
self.D = D
self.d = d
for filepath in self.filelist:
data_iter = pd.read_csv(filepath,
iterator=True,
chunksize=CHUNKSIZE,
engine="c",
error_bad_lines=False,
warn_bad_lines=False)
self.iterlist.append(data_iter)
def calStepLoc(self, L, D, d):
start = 0
end = 0
count = 0
while end != L:
start = 0 + count * (D + d)
end = min(start + D, L)
count = count + 1
yield start, end
def calDist(self, data):
L = data.shape[0]
D = self.D
d = self.d
loc = self.calStepLoc(L, D, d)
dist = []
for step, (start, end) in enumerate(loc):
twdata = data[start:end, :]
tw = twdata[:, -1].unsqueeze(dim=0).matmul(
twdata[:, 0:-1]).squeeze(dim=0).float()
twsum = tw.sum(dim=0).float().unsqueeze(dim=0)
twdist = tw.div(twsum).unsqueeze(dim=0) # 每个时间窗口内的数据分布
dist.append(twdist)
return torch.cat(dist, dim=0)
def __iter__(self):
worker_info = torch.utils.data.get_worker_info()
if worker_info is None:
list_start = 0
list_end = len(self.iterlist) - 1
else:
worker_id = worker_info.id
per_iternums = int(
math.ceil(len(self.iterlist) / float(worker_info.num_workers)))
list_start = 0 + worker_id * per_iternums
list_end = min(list_start + per_iternums, len(self.iterlist))
for fileind, dataiter in enumerate(self.iterlist[list_start:list_end]):
for step,data in enumerate(dataiter):
# pad_data, length = self.pad(torch.tensor(np.array(data)))
data = torch.tensor(np.array(data))
distdata = self.calDist(data)
yield distdata
del data
train_list = []
test_list = []
for index, filename in enumerate(filelist):
if index % 5 == 0:
test_list.append(filename)
else:
train_list.append(filename)
train_dataset = pandasDataset(train_list, D, d)
test_dataset = pandasDataset(test_list, D, d)
train_dataloader = DataLoader(train_dataset,
batch_size=BATCHSIZE,
num_workers=10,
pin_memory = True,
collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset,
batch_size=BATCHSIZE,
num_workers=10,
pin_memory = True,
collate_fn=collate_fn)
Environment
Collecting environment information...
PyTorch version: 1.10.0
Is debug build: False
CUDA used to build PyTorch: 11.3
ROCM used to build PyTorch: N/A
OS: Ubuntu 20.04.3 LTS (x86_64)
GCC version: (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Clang version: Could not collect
CMake version: Could not collect
Libc version: glibc-2.31
Python version: 3.8.12 (default, Oct 12 2021, 13:49:34) [GCC 7.5.0] (64-bit runtime)
Python platform: Linux-5.11.0-38-generic-x86_64-with-glibc2.17
Is CUDA available: True
CUDA runtime version: 10.1.243
GPU models and configuration: GPU 0: Quadro GV100
Nvidia driver version: 470.74
cuDNN version: Could not collect
HIP runtime version: N/A
MIOpen runtime version: N/A
Versions of relevant libraries:
[pip3] numpy==1.19.2
[pip3] numpydoc==1.1.0
[pip3] torch==1.10.0
[pip3] torchaudio==0.10.0
[pip3] torchvision==0.11.1
[conda] _pytorch_select 0.1 cpu_0 defaults
[conda] blas 1.0 mkl defaults
[conda] cudatoolkit 11.3.1 h2bc3f7f_2 defaults
[conda] ffmpeg 4.3 hf484d3e_0 pytorch
[conda] libmklml 2019.0.5 0 defaults
[conda] mkl 2020.2 256 defaults
[conda] mkl-service 2.3.0 py38he904b0f_0 defaults
[conda] mkl_fft 1.3.0 py38h54f3939_0 defaults
[conda] mkl_random 1.1.1 py38h0573a6f_0 defaults
[conda] numpy 1.19.2 py38h54aff64_0 defaults
[conda] numpy-base 1.19.2 py38hfa32c7d_0 defaults
[conda] numpydoc 1.1.0 pyhd3eb1b0_1 defaults
[conda] pytorch 1.10.0 py3.8_cuda11.3_cudnn8.2.0_0 pytorch
[conda] pytorch-mutex 1.0 cuda pytorch
[conda] torchaudio 0.10.0 py38_cu113 pytorch
[conda] torchvision 0.11.1 py38_cu113 pytorch