Hi, I have multiple h5 files that contains ~1000 samples each, here is my dataloader/sampler/dataset
class h5_dataset(object):
def __init__(self, h5_path):
self.h5_path = h5_path
self.h5 = None
def __getitem__(self, idx):
if self.h5 is None:
self.h5 = h5py.File(self.h5_path, 'r')
data = self.h5['dataset'][idx]
return data
class CustomDataset(Dataset):
def __init__(self, h5_path_list):
self.data_path_list = None
self.h5_dataset = []
for path in self.data_path_list:
self.h5_dataset.append(h5_dataset(path))
def __getitem__(self, idx_tuple):
assert len(idx_tuple) == 2
file_idx, idx = idx_tuple
data = self.h5_dataset[file_idx][idx]
return data
class RandInObjSampler(BatchSampler):
def __init__(self, dataset, batch_size):
self.dataset = dataset
self.batch_size = batch_size
self.obj_num = self.dataset.get_obj_num()
self.queue = queue.Queue()
self.remain_obj = None
def __len__(self):
return len(self.dataset) // self.batch_size
def __iter__(self):
self.remain_obj = [i for i in range(self.obj_num)]
for _ in range(len(self)):
if self.queue.qsize() < self.batch_size:
# Random next obj & add the permutated idx.
random.shuffle(self.remain_obj)
next_obj = self.remain_obj[0]
self.remain_obj = self.remain_obj[1:]
next_obj_sample_num = self.dataset.get_sample_num(next_obj)
map(self.queue.put, permutation_in_obj_index(next_obj, next_obj_sample_num))
# get batchsized item from the queue
items_idx = map(self.queue.get, [_ for _ in range(self.batch_size)])
yield items_idx
class FixLenDataloader(DataLoader):
def __init__(self, dataset, batch_size, pin_memory, num_workers):
batch_sampler = RandInObjSampler(dataset, batch_size)
super(FixLenDataloader, self).__init__(
dataset=dataset,
batch_sampler=batch_sampler,
pin_memory=pin_memory,
num_workers=num_workers)
and the issue here is when I just try to load the data by this script:
h5_folder = "/path/to/h5"
h5_list = glob.glob(os.path.join(h5_folder, "*.h5"))
dataset = CustomDataset(h5_list)
dataloader = CustomDataloader(
dataset,
batch_size=128,
pin_memory=torch.cuda.is_available(),
num_workers=multiprocessing.cpu_count())
for i in range(10):
print "num_epoch", i
for batch_idx, data in enumerate(dataloader):
if batch_idx % 100 == 0:
print('succees on batch {}'.format(batch_idx))
memReport()
cpuStats()
I am monitoring the memory with the scripts :How pytorch releases variable garbage?
when ~2000 batch, I get a CPU memory increase from 4.3G to 6.1G. The code also fails around 2000 batch when I am training the model. Any ideas? This is the memory trace.
2.7.6 (default, Nov 13 2018, 12:45:42)
[GCC 4.8.4]
25.2
svmem(total=269832167424, available=200663445504, percent=25.6, used=59130613760, free=8998412288, active=143371280384, inactive=85837078528, buffers=2089795584, cached=199613345792, shared=9689022464)
('memory GB:', 4.2747650146484375)
succees on batch 1500
2.7.6 (default, Nov 13 2018, 12:45:42)
[GCC 4.8.4]
32.0
svmem(total=269832167424, available=200383332352, percent=25.7, used=59333599232, free=9493114880, active=143184515072, inactive=85609123840, buffers=2089795584, cached=198915657728, shared=9768665088)
('memory GB:', 4.318531036376953)
succees on batch 1600
2.7.6 (default, Nov 13 2018, 12:45:42)
[GCC 4.8.4]
31.0
svmem(total=269832167424, available=195722665984, percent=27.5, used=64067747840, free=5217337344, active=147790471168, inactive=85312073728, buffers=2089795584, cached=198457286656, shared=9693081600)
('memory GB:', 4.31866455078125)
succees on batch 1700
2.7.6 (default, Nov 13 2018, 12:45:42)
[GCC 4.8.4]
38.1
svmem(total=269832167424, available=198720540672, percent=26.4, used=58239537152, free=12243107840, active=140344602624, inactive=86244818944, buffers=2089799680, cached=197259722752, shared=12520136704)
('memory GB:', 6.1643524169921875)
succees on batch 1800
2.7.6 (default, Nov 13 2018, 12:45:42)
[GCC 4.8.4]
23.9
svmem(total=269832167424, available=195888242688, percent=27.4, used=61950873600, free=8822206464, active=144383623168, inactive=85638770688, buffers=2089803776, cached=196969283584, shared=11642433536)
('memory GB:', 6.0920257568359375)
succees on batch 1900
2.7.6 (default, Nov 13 2018, 12:45:42)
[GCC 4.8.4]
33.9
svmem(total=269832167424, available=201348345856, percent=25.4, used=56451174400, free=14635212800, active=138813337600, inactive=85438504960, buffers=2089807872, cached=196655972352, shared=11682168832)
('memory GB:', 6.161582946777344)
succees on batch 2000
2.7.6 (default, Nov 13 2018, 12:45:42)
[GCC 4.8.4]
23.7
svmem(total=269832167424, available=195374772224, percent=27.6, used=62412275712, free=9571815424, active=144475328512, inactive=84977864704, buffers=2089807872, cached=195758268416, shared=11695505408)
('memory GB:', 6.161590576171875)
Write failed: Broken pipe